Lots of small improvements to Mutect2 code

This commit is contained in:
David Benjamin 2016-08-04 01:50:13 -04:00
parent ebceb06284
commit 01142dfb1c
4 changed files with 446 additions and 599 deletions

View File

@ -51,12 +51,41 @@
package org.broadinstitute.gatk.tools.walkers.cancer.m2; package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection;
import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Hidden; import java.util.Collections;
import java.util.List;
public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection { public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection {
/***************************************/
// Reference Metadata inputs
/***************************************/
/**
* MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant
* in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal
* sample to prove the variant is not present in germline.
*/
@Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false)
public List<RodBinding<VariantContext>> cosmicRod = Collections.emptyList();
/**
* A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants.
*/
@Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false)
public List<RodBinding<VariantContext>> normalPanelRod = Collections.emptyList();
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@Advanced @Advanced
@Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false) @Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false)
public boolean M2_DEBUG = false; public boolean M2_DEBUG = false;

View File

@ -201,7 +201,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
// the genotyping engine // the genotyping engine
protected HaplotypeCallerGenotypingEngine genotypingEngine = null; protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
private byte MIN_TAIL_QUALITY; private byte MIN_TAIL_QUALITY;
private double log10GlobalReadMismappingRate; private double log10GlobalReadMismappingRate;
@ -214,9 +213,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
@ArgumentCollection @ArgumentCollection
protected LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection(); protected LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection();
@Argument(fullName = "debug_read_name", required = false, doc="trace this read name through the calling process") @Argument(fullName = "debug_read_name", required = false, doc="trace this read name through the calling process")
public String DEBUG_READ_NAME = null; protected String DEBUG_READ_NAME = null;
@Hidden @Hidden
@Advanced @Advanced
@ -224,24 +222,81 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
final public int MQthreshold = 20; final public int MQthreshold = 20;
/***************************************/ public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
// Reference Metadata inputs
/***************************************/
/**
* MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant
* in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal
* sample to prove the variant is not present in germline.
*/
@Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false)
public List<RodBinding<VariantContext>> cosmicRod = Collections.emptyList();
/**
* A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants.
*/
@Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false)
public List<RodBinding<VariantContext>> normalPanelRod = Collections.emptyList();
private HaplotypeBAMWriter haplotypeBAMWriter; private HaplotypeBAMWriter haplotypeBAMWriter;
/**
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
* Records that are filtered in the comp track will be ignored.
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
*/
@Advanced
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
/**
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
*/
@Advanced
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"}));
/**
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
* so annotations will be excluded even if they are explicitly included with the other options.
*/
@Advanced
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
protected List<String> annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"}));
/**
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = { };
@Output(doc="File to which variants should be written")
protected VariantContextWriter vcfWriter = null;
/**
* Active region trimmer reference.
*/
@ArgumentCollection
protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer();
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public byte MIN_BASE_QUALTY_SCORE = 10;
public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE;
@Hidden
@Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
protected boolean errorCorrectReads = false;
@Hidden
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
protected boolean captureAssemblyFailureBAM = false;
@Advanced
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
protected boolean dontUseSoftClippedBases = false;
@Hidden
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
protected boolean justDetermineActiveRegions = false;
// reference base padding size
private static final int REFERENCE_PADDING = 500;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
@Override @Override
public void initialize() { public void initialize() {
@ -250,15 +305,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
samplesList = new IndexedSampleList(new ArrayList<>(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))); samplesList = new IndexedSampleList(new ArrayList<>(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader())));
// MUTECT: check that we have at least one tumor bam // MUTECT: check that we have at least one tumor bam
for(SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) { for(final SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) {
if (id.getTags().getPositionalTags().size() == 0) { if (id.getTags().getPositionalTags().size() == 0) {
throw new RuntimeException("BAMs must be tagged as either 'tumor' or 'normal'"); throw new RuntimeException("BAMs must be tagged as either 'tumor' or 'normal'");
} }
// only supports single-sample BAMs (ie first read group is representative) // only supports single-sample BAMs (ie first read group is representative)
String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample(); final String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample();
for(String tag : id.getTags().getPositionalTags()) { for(final String tag : id.getTags().getPositionalTags()) {
if (BAM_TAG_TUMOR.equalsIgnoreCase(tag)) { if (BAM_TAG_TUMOR.equalsIgnoreCase(tag)) {
tumorSAMReaderIDs.add(id); tumorSAMReaderIDs.add(id);
if (tumorSampleName == null) { if (tumorSampleName == null) {
@ -330,9 +385,9 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes(); final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes();
final GenomeAnalysisEngine toolkit = getToolkit(); final GenomeAnalysisEngine toolkit = getToolkit();
final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser();
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, genomeLocParser, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), MTAC, logger), !doNotRunPhysicalPhasing, MTAC); genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC,
tumorSampleName, normalSampleName, DEBUG_READ_NAME);
genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setCrossHaplotypeEventMerger(variantMerger);
genotypingEngine.setAnnotationEngine(annotationEngine); genotypingEngine.setAnnotationEngine(annotationEngine);
@ -353,7 +408,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
trimmer.snpPadding = 50; trimmer.snpPadding = 50;
samplesList = toolkit.getReadSampleList(); samplesList = toolkit.getReadSampleList();
Set<String> sampleSet = SampleListUtils.asSet(samplesList); final Set<String> sampleSet = SampleListUtils.asSet(samplesList);
if( MTAC.CONTAMINATION_FRACTION_FILE != null ) if( MTAC.CONTAMINATION_FRACTION_FILE != null )
MTAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(MTAC.CONTAMINATION_FRACTION_FILE, MTAC.CONTAMINATION_FRACTION, sampleSet, logger)); MTAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(MTAC.CONTAMINATION_FRACTION_FILE, MTAC.CONTAMINATION_FRACTION, sampleSet, logger));
@ -367,7 +422,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
Set<VCFHeaderLine> headerInfo = new HashSet<>(); final Set<VCFHeaderLine> headerInfo = new HashSet<>();
// all annotation fields from VariantAnnotatorEngine // all annotation fields from VariantAnnotatorEngine
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
@ -383,7 +438,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
headerInfo.addAll(getM2HeaderLines()); headerInfo.addAll(getM2HeaderLines());
headerInfo.addAll(getSampleHeaderLines()); headerInfo.addAll(getSampleHeaderLines());
List<String> outputSampleNames = getOutputSampleNames(); final List<String> outputSampleNames = getOutputSampleNames();
vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames)); vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames));
@ -391,7 +446,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
private Set<VCFHeaderLine> getM2HeaderLines(){ private Set<VCFHeaderLine> getM2HeaderLines(){
Set<VCFHeaderLine> headerInfo = new HashSet<>(); final Set<VCFHeaderLine> headerInfo = new HashSet<>();
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NORMAL_LOD_KEY)); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NORMAL_LOD_KEY));
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_LOD_KEY)); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_LOD_KEY));
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY)); headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY));
@ -430,21 +485,21 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
private Set<VCFHeaderLine> getSampleHeaderLines(){ private Set<VCFHeaderLine> getSampleHeaderLines(){
Set<VCFHeaderLine> sampleLines = new HashSet<>(); final Set<VCFHeaderLine> sampleLines = new HashSet<>();
if (printTCGAsampleHeader) { if (printTCGAsampleHeader) {
//NOTE: This will only list the first bam file for each tumor/normal sample if there is more than one //NOTE: This will only list the first bam file for each tumor/normal sample if there is more than one
Map<String, String> normalSampleHeaderAttributes = new HashMap<>(); final Map<String, String> normalSampleHeaderAttributes = new HashMap<>();
normalSampleHeaderAttributes.put("ID", "NORMAL"); normalSampleHeaderAttributes.put("ID", "NORMAL");
normalSampleHeaderAttributes.put("SampleName", normalSampleName); normalSampleHeaderAttributes.put("SampleName", normalSampleName);
if (normalSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF) if (normalSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
normalSampleHeaderAttributes.put("File", normalSAMReaderIDs.iterator().next().getSamFilePath()); normalSampleHeaderAttributes.put("File", normalSAMReaderIDs.iterator().next().getSamFilePath());
VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes); final VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes);
Map<String, String> tumorSampleHeaderAttributes = new HashMap<>(); final Map<String, String> tumorSampleHeaderAttributes = new HashMap<>();
tumorSampleHeaderAttributes.put("ID", "TUMOR"); tumorSampleHeaderAttributes.put("ID", "TUMOR");
tumorSampleHeaderAttributes.put("SampleName", tumorSampleName); tumorSampleHeaderAttributes.put("SampleName", tumorSampleName);
if (tumorSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF) if (tumorSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
tumorSampleHeaderAttributes.put("File", tumorSAMReaderIDs.iterator().next().getSamFilePath()); tumorSampleHeaderAttributes.put("File", tumorSAMReaderIDs.iterator().next().getSamFilePath());
VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes); final VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes);
sampleLines.add(normalSampleHeader); sampleLines.add(normalSampleHeader);
sampleLines.add(tumorSampleHeader); sampleLines.add(tumorSampleHeader);
@ -455,7 +510,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
private List<String> getOutputSampleNames(){ private List<String> getOutputSampleNames(){
if (printTCGAsampleHeader) { if (printTCGAsampleHeader) {
//Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize() //Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
List<String> sampleNamePlaceholders = new ArrayList<>(2); final List<String> sampleNamePlaceholders = new ArrayList<>(2);
sampleNamePlaceholders.add("TUMOR"); sampleNamePlaceholders.add("TUMOR");
sampleNamePlaceholders.add("NORMAL"); sampleNamePlaceholders.add("NORMAL");
return sampleNamePlaceholders; return sampleNamePlaceholders;
@ -466,14 +521,14 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
@Override @Override
public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if( context == null || context.getBasePileup().isEmpty() ) if( context == null || context.getBasePileup().isEmpty() )
// if we don't have any data, just abort early // if we don't have any data, just abort early
return new ActivityProfileState(ref.getLocus(), 0.0); return new ActivityProfileState(ref.getLocus(), 0.0);
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context); final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
AlignmentContext tumorContext = splitContexts.get(tumorSampleName); final AlignmentContext tumorContext = splitContexts.get(tumorSampleName);
AlignmentContext normalContext = splitContexts.get(normalSampleName); final AlignmentContext normalContext = splitContexts.get(normalSampleName);
// if there are no tumor reads... there is no activity! // if there are no tumor reads... there is no activity!
if (tumorContext == null) { if (tumorContext == null) {
@ -481,7 +536,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
// KCIBUL -- this method was inlined and modified from ReferenceConfidenceModel // KCIBUL -- this method was inlined and modified from ReferenceConfidenceModel
ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold); final ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold);
final double[] tumorGLs = calcGenotypeLikelihoodsOfRefVsAny(tumorPileup, ref.getBase(), MIN_BASE_QUALTY_SCORE); final double[] tumorGLs = calcGenotypeLikelihoodsOfRefVsAny(tumorPileup, ref.getBase(), MIN_BASE_QUALTY_SCORE);
final double tumorLod = tumorGLs[1] - tumorGLs[0]; final double tumorLod = tumorGLs[1] - tumorGLs[0];
@ -495,7 +550,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
// in any case, we have to handle the case where there is no normal (and thus no normal context) which is // in any case, we have to handle the case where there is no normal (and thus no normal context) which is
// different than having a normal but having no reads (where we should not enter the active region) // different than having a normal but having no reads (where we should not enter the active region)
if (normalSampleName != null && normalContext != null) { if (normalSampleName != null && normalContext != null) {
int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE); final int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE);
final double[] normalGLs = calcGenotypeLikelihoodsOfRefVsAny(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, 0.5f); final double[] normalGLs = calcGenotypeLikelihoodsOfRefVsAny(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, 0.5f);
final double normalLod = normalGLs[0] - normalGLs[1]; final double normalLod = normalGLs[0] - normalGLs[1];
@ -528,13 +583,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
// No reads here so nothing to do! // No reads here so nothing to do!
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region"); logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region");
// create the assembly using just high quality reads (Q20 or higher). We want to use lower // create the assembly using just high quality reads (Q20 or higher). We want to use lower
// quality reads in the PairHMM (and especially in the normal) later, so we can't use a ReadFilter // quality reads in the PairHMM (and especially in the normal) later, so we can't use a ReadFilter
ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension()); final ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension());
for (GATKSAMRecord rec : originalActiveRegion.getReads()) { for (final GATKSAMRecord rec : originalActiveRegion.getReads()) {
if (rec.getMappingQuality() >= MQthreshold ) { if (rec.getMappingQuality() >= MQthreshold ) {
assemblyActiveRegion.add(rec); assemblyActiveRegion.add(rec);
} }
@ -558,18 +612,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
// Stop the trimming madness!!! // Stop the trimming madness!!!
if (!trimmingResult.isVariationPresent()) if (!trimmingResult.isVariationPresent())
return referenceModelForNoVariation(originalActiveRegion,false); return referenceModelForNoVariation(originalActiveRegion,false);
logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result"); logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result");
final AssemblyResultSet assemblyResult = final AssemblyResultSet assemblyResult =
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult; trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
// final AssemblyResultSet assemblyResult = untrimmedAssemblyResult;
// after talking to Ryan -- they grab the reads out of the assembly (and trim then) to pass into the PairHMM
// because at one point they were trying error correcting of the reads based on the haplotypes.. but that is not
// working out, so it's safe for us just to take the reads
//
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping"); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping");
@ -577,10 +624,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads()); haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads());
} }
//
// final ActiveRegion regionForGenotyping = trimmingResult.getCallableRegion();
// final ActiveRegion regionForGenotyping = originalActiveRegion;
// filter out reads from genotyping which fail mapping quality based criteria // filter out reads from genotyping which fail mapping quality based criteria
//TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method //TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method
@ -594,7 +637,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads); final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads);
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads"); logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads");
// abort early if something is out of the acceptable range // abort early if something is out of the acceptable range
@ -612,17 +654,17 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList(); final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
final Map<String,List<GATKSAMRecord>> reads = splitReadsBySample( regionForGenotyping.getReads() ); final Map<String,List<GATKSAMRecord>> reads = splitReadsBySample( regionForGenotyping.getReads() );
for (List<GATKSAMRecord> rec : reads.values()) { for (final List<GATKSAMRecord> rec : reads.values()) {
logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample"); logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample");
} }
final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>(); final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>();
for (GATKSAMRecord read : regionForGenotyping.getReads()) { for (final GATKSAMRecord read : regionForGenotyping.getReads()) {
ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality()); ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality());
} }
// modify MAPQ scores in normal to be high so that we don't do any base quality score capping // modify MAPQ scores in normal to be high so that we don't do any base quality score capping
for(GATKSAMRecord rec : regionForGenotyping.getReads()) { for(final GATKSAMRecord rec : regionForGenotyping.getReads()) {
if (isReadFromNormal(rec)) { if (isReadFromNormal(rec)) {
rec.setMappingQuality(60); rec.setMappingQuality(60);
} }
@ -647,8 +689,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
readLikelihoods.changeReads(readRealignments); readLikelihoods.changeReads(readRealignments);
for (final GATKSAMRecord rec : readRealignments.keySet()) {
for (GATKSAMRecord rec : readRealignments.keySet()) {
logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods"); logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods");
} }
@ -666,15 +707,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
assemblyResult.getFullReferenceWithPadding(), assemblyResult.getFullReferenceWithPadding(),
assemblyResult.getPaddedReferenceLoc(), assemblyResult.getPaddedReferenceLoc(),
regionForGenotyping.getLocation(), regionForGenotyping.getLocation(),
getToolkit().getGenomeLocParser(),
metaDataTracker, metaDataTracker,
givenAlleles, false , givenAlleles);
tumorSampleName,
normalSampleName,
dbsnp.dbsnp,
cosmicRod,
DEBUG_READ_NAME
);
if ( MTAC.bamWriter != null ) { if ( MTAC.bamWriter != null ) {
final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes()); final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes());
@ -695,16 +729,16 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
List<VariantContext> annotatedCalls = new ArrayList<>(); final List<VariantContext> annotatedCalls = new ArrayList<>();
int eventCount = calledHaplotypes.getCalls().size(); final int eventCount = calledHaplotypes.getCalls().size();
Integer minEventDistance = null; Integer minEventDistance = null;
Integer maxEventDistance = null; Integer maxEventDistance = null;
Integer lastPosition = null; Integer lastPosition = null;
for (VariantContext vc : calledHaplotypes.getCalls()) { for (final VariantContext vc : calledHaplotypes.getCalls()) {
if (lastPosition == null) { if (lastPosition == null) {
lastPosition = vc.getStart(); lastPosition = vc.getStart();
} else { } else {
int dist = Math.abs(vc.getStart() - lastPosition); final int dist = Math.abs(vc.getStart() - lastPosition);
if (maxEventDistance == null || dist > maxEventDistance) { if (maxEventDistance == null || dist > maxEventDistance) {
maxEventDistance = dist; maxEventDistance = dist;
} }
@ -713,23 +747,23 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
} }
} }
Map<String, Object> eventDistanceAttributes = new HashMap<>(); final Map<String, Object> eventDistanceAttributes = new HashMap<>();
eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount); eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance); eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance);
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance); eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance);
// can we do this with the Annotation classes instead? // can we do this with the Annotation classes instead?
for (VariantContext originalVC : calledHaplotypes.getCalls()) { for (final VariantContext originalVC : calledHaplotypes.getCalls()) {
VariantContextBuilder vcb = new VariantContextBuilder(originalVC); final VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes()); final Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
attributes.putAll(eventDistanceAttributes); attributes.putAll(eventDistanceAttributes);
vcb.attributes(attributes); vcb.attributes(attributes);
Set<String> filters = new HashSet<>(originalVC.getFilters()); final Set<String> filters = new HashSet<>(originalVC.getFilters());
double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1); final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) { if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME); filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
} }
@ -739,22 +773,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes)); filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
} }
if (filters.size() > 0) { vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters);
vcb.filters(filters);
} else {
vcb.passFilters();
}
if (printTCGAsampleHeader) { if (printTCGAsampleHeader) {
GenotypesContext genotypesWithBamSampleNames = originalVC.getGenotypes(); final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make();
List<Genotype> renamedGenotypes = new ArrayList<>(); final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make();
GenotypeBuilder GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(tumorSampleName)); vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype));
GTbuilder.name("TUMOR");
renamedGenotypes.add(GTbuilder.make());
GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(normalSampleName));
GTbuilder.name("NORMAL");
renamedGenotypes.add(GTbuilder.make());
vcb.genotypes(renamedGenotypes);
} }
annotatedCalls.add(vcb.make()); annotatedCalls.add(vcb.make());
@ -765,15 +789,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
return annotatedCalls; return annotatedCalls;
} }
private Set<String> calculateFilters(RefMetaDataTracker metaDataTracker, VariantContext vc, Map<String, Object> eventDistanceAttributes) { private Set<String> calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map<String, Object> eventDistanceAttributes) {
Set<String> filters = new HashSet<>(); final Set<String> filters = new HashSet<>();
Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); final Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY); final Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY);
Collection<VariantContext> panelOfNormalsVC = metaDataTracker.getValues(normalPanelRod, final Collection<VariantContext> panelOfNormalsVC = metaDataTracker.getValues(MTAC.normalPanelRod,
getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart())); getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart()));
VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next(); final VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next();
if (ponVc != null) { if (ponVc != null) {
filters.add(GATKVCFConstants.PON_FILTER_NAME); filters.add(GATKVCFConstants.PON_FILTER_NAME);
@ -788,13 +812,13 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
double normalF = 0; double normalF = 0;
int normalAltQualityScoreSum = 0; int normalAltQualityScoreSum = 0;
if (hasNormal()) { if (hasNormal()) {
Genotype normalGenotype = vc.getGenotype(normalSampleName); final Genotype normalGenotype = vc.getGenotype(normalSampleName);
// NOTE: how do we get the non-ref depth here? // NOTE: how do we get the non-ref depth here?
normalAltCounts = normalGenotype.getAD()[1]; normalAltCounts = normalGenotype.getAD()[1];
normalF = (Double) normalGenotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY); normalF = (Double) normalGenotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY);
Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY); final Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY);
if (qss != null) { if (qss != null) {
normalAltQualityScoreSum = (Integer) ((Object[]) qss)[1]; normalAltQualityScoreSum = (Integer) ((Object[]) qss)[1];
} else { } else {
@ -819,11 +843,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
// such as ACTACTACT -> ACTACT, are overwhelmingly false positives so we // such as ACTACTACT -> ACTACT, are overwhelmingly false positives so we
// hard filter them out by default // hard filter them out by default
if (vc.isIndel()) { if (vc.isIndel()) {
ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY); final ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY);
String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, ""); final String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, "");
if (rpa != null && rpa.size() > 1 && ru.length() > 1) { if (rpa != null && rpa.size() > 1 && ru.length() > 1) {
int refCount = (Integer) rpa.get(0); final int refCount = (Integer) rpa.get(0);
int altCount = (Integer) rpa.get(1); final int altCount = (Integer) rpa.get(1);
if (refCount - altCount == 1) { if (refCount - altCount == 1) {
filters.add(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME); filters.add(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME);
@ -840,10 +864,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
// clustered read position filter // clustered read position filter
if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){ if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY); final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY); final Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY);
Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY); final Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY);
Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY); final Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
//If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter //If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter
if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) || if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) ||
(tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD)) (tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD))
@ -866,14 +890,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
*/ */
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final double f) { protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final double f) {
final double[] genotypeLikelihoods = new double[2]; final double[] genotypeLikelihoods = new double[2];
int AA = 0, AB=1; final int AA = 0;
final int AB=1;
for( final PileupElement p : pileup ) { for( final PileupElement p : pileup ) {
final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual()); final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual());
if( p.isDeletion() || qual > minBaseQual ) { if( p.isDeletion() || qual > minBaseQual ) {
// TODO: why not use base qualities here? // TODO: why not use base qualities here?
//double pobs = QualityUtils.qualToErrorProbLog10(qual); //double pobs = QualityUtils.qualToErrorProbLog10(qual);
double pobs = 1.0d - pow(10, (30 / -10.0)); final double pobs = 1.0d - pow(10, (30 / -10.0));
if( isNonRef(refBase, p)) { if( isNonRef(refBase, p)) {
genotypeLikelihoods[AB] += Math.log10(f*pobs + (1-f)*pobs/3.0d); genotypeLikelihoods[AB] += Math.log10(f*pobs + (1-f)*pobs/3.0d);
genotypeLikelihoods[AA] += Math.log10((1-pobs)/3); genotypeLikelihoods[AA] += Math.log10((1-pobs)/3);
@ -905,7 +930,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) { protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) {
double f = calculateF(pileup, refBase, minBaseQual); final double f = calculateF(pileup, refBase, minBaseQual);
return calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, minBaseQual, f); return calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, minBaseQual, f);
} }
@ -923,11 +948,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
} }
} }
double f = (double) altCount / ((double) refCount + (double) altCount); return (double) altCount / (refCount + altCount);
return f;
} }
private boolean isNonRef(byte refBase, PileupElement p) { private boolean isNonRef(final byte refBase, final PileupElement p) {
return p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip(); return p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip();
} }
@ -960,8 +984,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
return readsToRemove; return readsToRemove;
} }
private static GATKSAMRecord findReadByName(Collection<GATKSAMRecord> reads, String name) { private static GATKSAMRecord findReadByName(final Collection<GATKSAMRecord> reads, final String name) {
for(GATKSAMRecord read : reads) { for(final GATKSAMRecord read : reads) {
if (name.equals(read.getReadName())) return read; if (name.equals(read.getReadName())) return read;
} }
return null; return null;
@ -1022,7 +1046,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
@Override @Override
public Integer reduce(List<VariantContext> callsInRegion, Integer numCalledRegions) { public Integer reduce(final List<VariantContext> callsInRegion, final Integer numCalledRegions) {
for( final VariantContext call : callsInRegion ) { for( final VariantContext call : callsInRegion ) {
vcfWriter.add( call ); vcfWriter.add( call );
} }
@ -1030,7 +1054,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
@Override @Override
public void onTraversalDone(Integer result) { public void onTraversalDone(final Integer result) {
// if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it // if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
// referenceConfidenceModel.close(); // referenceConfidenceModel.close();
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed //TODO remove the need to call close here for debugging, the likelihood output stream should be managed
@ -1045,114 +1069,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); } public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
public boolean alwaysAppendDbsnpId() { return false; } public boolean alwaysAppendDbsnpId() { return false; }
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
/**
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
* Records that are filtered in the comp track will be ignored.
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
*/
@Advanced
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
/**
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
*/
@Advanced
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
// protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"}));
// protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBy ruSample", "TandemRepeatAnnotator",
// "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"}));
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"}));
/**
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
* so annotations will be excluded even if they are explicitly included with the other options.
*/
@Advanced
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
protected List<String> annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"}));
/**
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
//protected String[] annotationGroupsToUse = { StandardAnnotation.class.getSimpleName() };
protected String[] annotationClassesToUse = { };
/**
* A raw, unfiltered, highly sensitive callset in VCF format.
*/
@Output(doc="File to which variants should be written")
protected VariantContextWriter vcfWriter = null;
/**
* Active region trimmer reference.
*/
@ArgumentCollection
protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer();
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;
/**
* The minimum confidence needed for a given base for it to be used in variant calling.
*/
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public byte MIN_BASE_QUALTY_SCORE = 10;
// PAIR-HMM-Related Goodness
// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.AGGRESSIVE;
public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE;
// Parameters to control read error correction
@Hidden
@Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
protected boolean errorCorrectReads = false;
@Hidden
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
protected boolean captureAssemblyFailureBAM = false;
@Advanced
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
protected boolean dontUseSoftClippedBases = false;
@Hidden
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
protected boolean justDetermineActiveRegions = false;
// reference base padding size
private static final int REFERENCE_PADDING = 500;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
/** /**
* High-level function that runs the assembler on the active region reads, * High-level function that runs the assembler on the active region reads,
* returning a data structure with the resulting information needed * returning a data structure with the resulting information needed
@ -1224,14 +1140,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
clippedRead = ReadClipper.hardClipToRegion(clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop()); clippedRead = ReadClipper.hardClipToRegion(clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop());
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
readsToUse.add(clippedRead); readsToUse.add(clippedRead);
} }
} }
} }
// TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
final List<GATKSAMRecord> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart); final List<GATKSAMRecord> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) { if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) {
@ -1268,9 +1181,9 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
* *
* @param reads the list of reads to consider * @param reads the list of reads to consider
*/ */
private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads, Set<String> normalSampleNames) { private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads, final Set<String> normalSampleNames) {
Map<String, List<GATKSAMRecord>> data = splitReadsBySample(reads); final Map<String, List<GATKSAMRecord>> data = splitReadsBySample(reads);
for ( String sampleName : data.keySet() ) { for ( final String sampleName : data.keySet() ) {
final boolean isTumor = !normalSampleNames.contains(sampleName); final boolean isTumor = !normalSampleNames.contains(sampleName);
final List<GATKSAMRecord> perSampleReadList = data.get(sampleName); final List<GATKSAMRecord> perSampleReadList = data.get(sampleName);
@ -1284,16 +1197,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
} }
} }
public static void logReadInfo(String readName, Collection<GATKSAMRecord> records, String message) { public static void logReadInfo(final String readName, final Collection<GATKSAMRecord> records, final String message) {
if (readName != null) { if (readName != null) {
for (GATKSAMRecord rec : records) { for (final GATKSAMRecord rec : records) {
logReadInfo(readName, rec, message); logReadInfo(readName, rec, message);
} }
} }
} }
public static void logReadInfo(String readName, GATKSAMRecord rec, String message) { public static void logReadInfo(final String readName, final GATKSAMRecord rec, final String message) {
if (readName != null && rec != null && readName.equals(rec.getReadName())) { if (readName != null && rec != null && readName.equals(rec.getReadName())) {
logger.info("Found " + rec.toString() + " - " + message); logger.info("Found " + rec.toString() + " - " + message);
} }
@ -1322,7 +1234,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
return result; return result;
} }
private boolean isReadFromNormal(GATKSAMRecord rec) { private boolean isReadFromNormal(final GATKSAMRecord rec) {
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample()); return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
} }
@ -1338,7 +1250,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
@Advanced @Advanced
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
protected boolean doNotRunPhysicalPhasing = false; protected boolean doNotRunPhysicalPhasing = false;
} }

View File

@ -54,9 +54,11 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import htsjdk.samtools.util.StringUtil; import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.*;
import org.apache.commons.collections.ListUtils;
import org.apache.commons.lang.mutable.MutableDouble; import org.apache.commons.lang.mutable.MutableDouble;
import org.apache.commons.lang.mutable.MutableInt; import org.apache.commons.lang.mutable.MutableInt;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
@ -78,16 +80,33 @@ import java.util.*;
public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine { public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
protected final M2ArgumentCollection MTAC; private final M2ArgumentCollection MTAC;
private final TumorPowerCalculator strandArtifactPowerCalculator; private final TumorPowerCalculator strandArtifactPowerCalculator;
final boolean REF_AND_ALT = false;
final boolean ALT_ONLY = true; private final String tumorSampleName;
private final String matchedNormalSampleName;
private final String DEBUG_READ_NAME;
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; }
};
private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class); private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class);
public SomaticGenotypingEngine(final M2ArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing, final M2ArgumentCollection MTAC) { public SomaticGenotypingEngine(final M2ArgumentCollection configuration,
super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing); final SampleList samples,
final GenomeLocParser genomeLocParser,
final boolean doPhysicalPhasing,
final M2ArgumentCollection MTAC,
final String tumorSampleName,
final String matchedNormalSampleName,
final String DEBUG_READ_NAME) {
super(configuration, samples, genomeLocParser, DUMMY_AF_CALCULATOR_PROVIDER, doPhysicalPhasing);
this.MTAC = MTAC; this.MTAC = MTAC;
this.tumorSampleName = tumorSampleName;
this.matchedNormalSampleName = matchedNormalSampleName;
this.DEBUG_READ_NAME = DEBUG_READ_NAME;
// coverage related initialization // coverage related initialization
final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10)); final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10));
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f); strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
@ -105,37 +124,22 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
* @param ref Reference bytes at active region * @param ref Reference bytes at active region
* @param refLoc Corresponding active region genome location * @param refLoc Corresponding active region genome location
* @param activeRegionWindow Active window * @param activeRegionWindow Active window
* @param genomeLocParser GenomeLocParser
* @param activeAllelesToGenotype Alleles to genotype * @param activeAllelesToGenotype Alleles to genotype
* @param emitReferenceConfidence whether we should add a &lt;NON_REF&gt; alternative allele to the result variation contexts.
* *
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
* *
*/ */
// @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow! // TODO - can this be refactored? this is hard to follow!
public CalledHaplotypes callMutations ( public CalledHaplotypes callMutations (
final List<Haplotype> haplotypes, final List<Haplotype> haplotypes,
//final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap, final ReadLikelihoods<Haplotype> readLikelihoods,
final ReadLikelihoods<Haplotype> readLikelihoods, final Map<String, Integer> originalNormalReadQualities,
final Map<String, Integer> originalNormalReadQualities, final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList, final byte[] ref,
final byte[] ref, final GenomeLoc refLoc,
final GenomeLoc refLoc, final GenomeLoc activeRegionWindow,
final GenomeLoc activeRegionWindow, final RefMetaDataTracker tracker,
final GenomeLocParser genomeLocParser, final List<VariantContext> activeAllelesToGenotype) {
final RefMetaDataTracker tracker,
final List<VariantContext> activeAllelesToGenotype,
final boolean emitReferenceConfidence,
final String tumorSampleName,
final String matchedNormalSampleName,
final RodBinding<VariantContext> dbsnpRod,
final List<RodBinding<VariantContext>> cosmicRod,
final String DEBUG_READ_NAME
) {
// sanity check input arguments // sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods); if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
@ -143,8 +147,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
// Somatic Tumor/Normal Sample Handling // Somatic Tumor/Normal Sample Handling
verifySamplePresence(tumorSampleName, readLikelihoods.samples()); verifySamplePresence(tumorSampleName, readLikelihoods.samples());
@ -159,293 +161,217 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
final List<VariantContext> returnCalls = new ArrayList<>(); final List<VariantContext> returnCalls = new ArrayList<>();
for( final int loc : startPosKeySet ) { for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region if( loc < activeRegionWindow.getStart() || loc > activeRegionWindow.getStop() ) {
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); continue;
if( eventsAtThisLoc.isEmpty() ) { continue; }
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
// Sanity check the priority list for mistakes
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
// Merge the event to find a common reference representation
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
if (emitReferenceConfidence)
mergedVC = addNonRefSymbolicAllele(mergedVC);
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
}
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
if( configuration.DEBUG && logger != null ) {
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
//LDG: do we want to do this before or after pulling out overlapping reads?
if (MTAC.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
//if (!mergedVC.isBiallelic()) {
// logger.info("[UNSUPPORTED] Detected non-Biallelic VC" + mergedVC.toString());
// continue;
//}
// TODO: once tests are passing, refactor to use the new data structure (not the deprecated one)
// handle overlapping fragments
// TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!)
PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
// extend to multiple samples
// compute tumor LOD for each alternate allele
// TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you
final PerAlleleCollection<Double> altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false);
final PerAlleleCollection<Double> tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions);
if( configuration.DEBUG && logger != null ) {
StringBuilder outputSB = new StringBuilder("Calculated allelic fraction at " + loc + " = [");
for (Allele allele : altAlleleFractions.getAltAlleles()){
outputSB.append( allele + ": " + altAlleleFractions.getAlt(allele) + ", ");
}
outputSB.append("]");
logger.info(outputSB.toString());
}
final PerAlleleCollection<Double> tumorLods = PerAlleleCollection.createPerAltAlleleCollection();
for (Allele altAllele : mergedVC.getAlternateAlleles()){
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
}
if (configuration.DEBUG && logger != null) {
StringBuilder outputSB = new StringBuilder("Tumor LOD at " + loc + " = [");
for (Allele altAllele : tumorLods.getAltAlleles()) {
outputSB.append( altAllele + ": " + tumorLods.getAlt(altAllele) + ", ");
}
outputSB.append("]");
logger.info(outputSB.toString());
}
double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
PerReadAlleleLikelihoodMap normalPRALM = null;
PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
// if normal bam is available, compute normal LOD
if (hasNormal) {
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads");
GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
Collection<VariantContext> cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc);
Collection<VariantContext> dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc);
// remove the effect of cosmic from dbSNP
final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal
NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD;
// compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5
// note normal LOD is REF over ALT, the reciprocal of the tumor LOD
final PerAlleleCollection<Double> diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection();
for (final Allele allele : mergedVC.getAlternateAlleles()){
diploidHetAlleleFractions.setAlt(allele, 0.5);
}
final PerAlleleCollection<Double> normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions);
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele));
}
}
int numPassingAlts = 0;
Set<Allele> allelesThatPassThreshold = new HashSet<>();
Allele alleleWithHighestTumorLOD = null;
// TODO: use lambda
for (Allele altAllele : mergedVC.getAlternateAlleles()) {
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true;
if (passesTumorLodThreshold && passesNormalLodThreshold) {
numPassingAlts++;
allelesThatPassThreshold.add(altAllele);
if (alleleWithHighestTumorLOD == null
|| tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){
alleleWithHighestTumorLOD = altAllele;
}
}
}
final boolean emitVariant = numPassingAlts > 0;
VariantContext call = null;
if (emitVariant) {
VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
// FIXME: can simply get first alternate since above we only deal with Bi-allelic sites...
int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size();
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD));
if (hasNormal) {
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD));
if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) {
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
}
}
// M1-style strand artifact filter
// TODO: move code to MuTect2::calculateFilters()
// skip if VC has multiple alleles - it will get filtered later anyway
if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) {
final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap();
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
final PerAlleleCollection<Double> alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
final PerAlleleCollection<Double> tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward);
final PerAlleleCollection<Double> alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
final PerAlleleCollection<Double> tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
if( configuration.DEBUG && logger != null ) {
StringBuilder forwardMessage = new StringBuilder("Calculated forward allelic fraction at " + loc + " = [");
StringBuilder reverseMessage = new StringBuilder("Calculated reverse allelic fraction at " + loc + " = [");
for (Allele altAllele : altAlleleFractions.getAltAlleles()){
forwardMessage.append( altAllele + ": " + alleleFractionsForward.getAlt(altAllele) + ", ");
reverseMessage.append( altAllele + ": " + alleleFractionsReverse.getAlt(altAllele) + ", ");
}
forwardMessage.append("]");
reverseMessage.append("]");
logger.info(forwardMessage.toString());
logger.info(reverseMessage.toString());
}
double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
double tumorSBpower_fwd = 0.0;
double tumorSBpower_rev = 0.0;
try {
// Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction
tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
}
catch (Throwable t) {
System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc);
t.printStackTrace(System.err);
throw new RuntimeException(t);
}
callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd);
callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev);
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd);
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev);
// TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ||
(tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD))
callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME);
}
// TODO: this probably belongs in M2::calculateFilters()
if (numPassingAlts > 1) {
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
}
// build genotypes TODO: this part needs review and refactor
List<Allele> tumorAlleles = new ArrayList<>();
tumorAlleles.add(mergedVC.getReference());
tumorAlleles.add(alleleWithHighestTumorLOD);
Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles)
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD))
.make(); // TODO: add ADs?
List<Genotype> genotypes = new ArrayList<>();
genotypes.add(tumorGenotype);
// We assume that the genotype in the normal is 0/0
// TODO: is normal always homozygous reference?
List<Allele> homRefAllelesforNormalGenotype = new ArrayList<>();
homRefAllelesforNormalGenotype.addAll(Collections.nCopies(2, mergedVC.getReference()));
// if we are calling with a normal, build the genotype for the sample to appear in vcf
int REF = 0, ALT = 1;
if (hasNormal) {
PerAlleleCollection<Integer> normalCounts = getRefAltCount(mergedVC, normalPRALM, false);
final int normalRefAlleleDepth = normalCounts.getRef();
final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD);
final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth };
final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth);
final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype)
.AD(normalAlleleDepths)
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction)
.make();
genotypes.add(normalGenotype);
}
//only use alleles found in the tumor (
call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
}
// how should we be making use of _perSampleFilteredReadList_?
if( call != null ) {
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call);
ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() ) {
final List<Haplotype> haplotypeList = alleleMapper.get(calledAllele);
if (haplotypeList == null) continue;
calledHaplotypes.addAll(haplotypeList);
}
returnCalls.add( annotatedCall );
}
} }
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
if( eventsAtThisLoc.isEmpty() ) { continue; }
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
}
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
//LDG: do we want to do this before or after pulling out overlapping reads?
if (MTAC.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
final PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
// extend to multiple samples
// compute tumor LOD for each alternate allele
// TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you
final PerAlleleCollection<Double> altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false);
final PerAlleleCollection<Double> tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions);
final PerAlleleCollection<Double> tumorLods = PerAlleleCollection.createPerAltAlleleCollection();
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
}
double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
PerReadAlleleLikelihoodMap normalPRALM = null;
final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
// if normal bam is available, compute normal LOD
if (hasNormal) {
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads");
final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc);
final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc);
// remove the effect of cosmic from dbSNP
final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal
NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD;
// compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5
// note normal LOD is REF over ALT, the reciprocal of the tumor LOD
final PerAlleleCollection<Double> diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection();
for (final Allele allele : mergedVC.getAlternateAlleles()){
diploidHetAlleleFractions.setAlt(allele, 0.5);
}
final PerAlleleCollection<Double> normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions);
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele));
}
}
int numPassingAlts = 0;
final Set<Allele> allelesThatPassThreshold = new HashSet<>();
Allele alleleWithHighestTumorLOD = null;
// TODO: use lambda
for (final Allele altAllele : mergedVC.getAlternateAlleles()) {
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true;
if (passesTumorLodThreshold && passesNormalLodThreshold) {
numPassingAlts++;
allelesThatPassThreshold.add(altAllele);
if (alleleWithHighestTumorLOD == null
|| tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){
alleleWithHighestTumorLOD = altAllele;
}
}
}
if (numPassingAlts == 0) {
continue;
}
final VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
// FIXME: can simply get first alternate since above we only deal with Bi-allelic sites...
final int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size();
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD));
if (hasNormal) {
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD));
if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) {
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
}
}
// M1-style strand artifact filter
// TODO: move code to MuTect2::calculateFilters()
// skip if VC has multiple alleles - it will get filtered later anyway
if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) {
final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap();
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
final PerAlleleCollection<Double> alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
final PerAlleleCollection<Double> tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward);
final PerAlleleCollection<Double> alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
final PerAlleleCollection<Double> tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
final double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
final double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
// Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction
final double tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
final double tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd);
callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev);
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd);
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev);
// TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ||
(tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD))
callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME);
}
// TODO: this probably belongs in M2::calculateFilters()
if (numPassingAlts > 1) {
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
}
// build genotypes TODO: this part needs review and refactor
final List<Allele> tumorAlleles = Arrays.asList(mergedVC.getReference(), alleleWithHighestTumorLOD);
final Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles)
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD))
.make(); // TODO: add ADs?
final List<Genotype> genotypes = new ArrayList<>();
genotypes.add(tumorGenotype);
// We assume that the genotype in the normal is 0/0
// TODO: is normal always homozygous reference?
final List<Allele> homRefAllelesforNormalGenotype = Collections.nCopies(2, mergedVC.getReference());
// if we are calling with a normal, build the genotype for the sample to appear in vcf
if (hasNormal) {
final PerAlleleCollection<Integer> normalCounts = getRefAltCount(mergedVC, normalPRALM, false);
final int normalRefAlleleDepth = normalCounts.getRef();
final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD);
final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth };
final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth);
final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype)
.AD(normalAlleleDepths)
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction)
.make();
genotypes.add(normalGenotype);
}
//only use alleles found in the tumor (
final VariantContext call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
// how should we be making use of _perSampleFilteredReadList_?
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
genomeLocParser, false, alleleMapper, readAlleleLikelihoods, call);
final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll);
returnCalls.add( annotatedCall );
} }
// TODO: understand effect of enabling this for somatic calling... // TODO: understand effect of enabling this for somatic calling...
final List<VariantContext> phasedCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls; final List<VariantContext> outputCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
return new CalledHaplotypes(phasedCalls, calledHaplotypes); return new CalledHaplotypes(outputCalls, calledHaplotypes);
//return new CalledHaplotypes(returnCalls, calledHaplotypes);
} }
private void verifySamplePresence(String sampleName, List<String> samples) { private void verifySamplePresence(final String sampleName, final List<String> samples) {
if (!samples.contains(sampleName)) { if (!samples.contains(sampleName)) {
throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples)); throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples));
} }
@ -459,7 +385,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
* @param alleleFractions allele fraction(s) for alternate allele(s) * @param alleleFractions allele fraction(s) for alternate allele(s)
* *
* @return genotype likelihoods for homRef and het for each alternate allele * @return genotype likelihoods for homRef and het for each alternate allele
*/ */
private PerAlleleCollection<Double> getHetGenotypeLogLikelihoods(final VariantContext mergedVC, private PerAlleleCollection<Double> getHetGenotypeLogLikelihoods(final VariantContext mergedVC,
final PerReadAlleleLikelihoodMap tumorPRALM, final PerReadAlleleLikelihoodMap tumorPRALM,
final Map<String, Integer> originalNormalMQs, final Map<String, Integer> originalNormalMQs,
@ -470,13 +396,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
} }
final PerAlleleCollection<MutableDouble> genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection(); final PerAlleleCollection<MutableDouble> genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection();
for (final Allele allele : mergedVC.getAlleles()){ mergedVC.getAlleles().forEach(a -> genotypeLogLikelihoods.set(a, new MutableDouble(0)));
genotypeLogLikelihoods.set(allele, new MutableDouble(0.0));
}
final Allele refAllele = mergedVC.getReference(); final Allele refAllele = mergedVC.getReference();
for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) { for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) {
Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) { if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) {
continue; continue;
} }
@ -484,9 +408,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele); final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele);
genotypeLogLikelihoods.getRef().add(readRefLogLikelihood); genotypeLogLikelihoods.getRef().add(readRefLogLikelihood);
for (Allele altAllele : alleleFractions.getAltAlleles()) { for (final Allele altAllele : alleleFractions.getAltAlleles()) {
double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele); final double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele);
double adjustedReadAltLL = Math.log10( final double adjustedReadAltLL = Math.log10(
Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) + Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) +
Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele) Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele)
); );
@ -515,9 +439,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
final PerAlleleCollection<Integer> alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly); final PerAlleleCollection<Integer> alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly);
final PerAlleleCollection<Double> alleleFractions = PerAlleleCollection.createPerAltAlleleCollection(); final PerAlleleCollection<Double> alleleFractions = PerAlleleCollection.createPerAltAlleleCollection();
int refCount = alleleCounts.getRef(); final int refCount = alleleCounts.getRef();
for ( final Allele altAllele : vc.getAlternateAlleles() ) { for ( final Allele altAllele : vc.getAlternateAlleles() ) {
int altCount = alleleCounts.getAlt(altAllele); final int altCount = alleleCounts.getAlt(altAllele);
double alleleFraction = (double) altCount / (refCount + altCount); double alleleFraction = (double) altCount / (refCount + altCount);
// weird case, but I've seen it happen in one strand cases // weird case, but I've seen it happen in one strand cases
if (refCount == 0 && altCount == refCount ) { if (refCount == 0 && altCount == refCount ) {
@ -552,16 +476,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
final PerAlleleCollection<MutableInt> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection(); final PerAlleleCollection<MutableInt> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection();
vcAlleles.stream().forEach(a -> alleleCounts.set(a, new MutableInt(0)));
// initialize the allele counts to 0
for (final Allele allele : vcAlleles) {
alleleCounts.set(allele, new MutableInt(0));
}
for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) { for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) {
final GATKSAMRecord read = readAlleleLikelihoodMap.getKey(); final GATKSAMRecord read = readAlleleLikelihoodMap.getKey();
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue(); final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles); final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles);
if (read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) { if (read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) {
alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment(); alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment();
@ -581,18 +501,16 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
} }
} }
private void filterPRALMForOverlappingReads(PerReadAlleleLikelihoodMap pralm, Allele ref, int location, boolean retainMismatches) { private void filterPRALMForOverlappingReads(final PerReadAlleleLikelihoodMap pralm, final Allele ref, final int location, final boolean retainMismatches) {
final Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
// iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them // iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them
Map<String, GATKSAMRecord> nameToRead = new HashMap<>(); final Map<String, GATKSAMRecord> nameToRead = new HashMap<>();
Set<GATKSAMRecord> readsToKeep = new HashSet<>(); final Set<GATKSAMRecord> readsToKeep = new HashSet<>();
for(GATKSAMRecord rec : m.keySet()) { for(final GATKSAMRecord rec : m.keySet()) {
// if we haven't seen it... just record the name and add it to the list of reads to keep // if we haven't seen it... just record the name and add it to the list of reads to keep
GATKSAMRecord existing = nameToRead.get(rec.getReadName()); final GATKSAMRecord existing = nameToRead.get(rec.getReadName());
if (existing == null) { if (existing == null) {
nameToRead.put(rec.getReadName(), rec); nameToRead.put(rec.getReadName(), rec);
readsToKeep.add(rec); readsToKeep.add(rec);
@ -604,11 +522,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
// TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!! // TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!!
if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) { if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) {
MostLikelyAllele existingMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing)); final MostLikelyAllele existingMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing));
Allele existingAllele = existingMLA.getMostLikelyAllele(); final Allele existingAllele = existingMLA.getMostLikelyAllele();
MostLikelyAllele recMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec)); final MostLikelyAllele recMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec));
Allele recAllele = recMLA.getMostLikelyAllele(); final Allele recAllele = recMLA.getMostLikelyAllele();
// if the reads disagree at this position... // if the reads disagree at this position...
if (!existingAllele.equals(recAllele)) { if (!existingAllele.equals(recAllele)) {
@ -654,7 +572,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
} }
private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) { private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) {
Map<GATKSAMRecord, Map<Allele, Double>> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap(); final Map<GATKSAMRecord, Map<Allele, Double>> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap();
for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) { for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) {
if (read.isStrandless()) if (read.isStrandless())
continue; continue;
@ -669,17 +587,4 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
} }
} }
} }
// Move to utility class so we can use one shared with HaplotypeCallerGenotypingEngine
/**
 * Returns a copy of the given variant context whose allele list additionally contains the
 * symbolic non-reference allele ({@code <NON_REF>}), as used in GVCF-style output.
 *
 * @param mergedVC the variant context to augment; not modified
 * @return a new VariantContext identical to {@code mergedVC} except for the appended NON_REF allele
 */
private VariantContext addNonRefSymbolicAllele(final VariantContext mergedVC) {
    // Fetch the allele list once (the original fetched it twice) and presize the copy
    // for the one extra symbolic allele.
    final List<Allele> originalAlleles = mergedVC.getAlleles();
    final List<Allele> augmentedAlleles = new ArrayList<>(originalAlleles.size() + 1);
    augmentedAlleles.addAll(originalAlleles);
    augmentedAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    return new VariantContextBuilder(mergedVC).alleles(augmentedAlleles).make();
}
} }

View File

@ -55,6 +55,7 @@ import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.BinomialDistribution; import org.apache.commons.math.distribution.BinomialDistribution;
import org.apache.commons.math.distribution.BinomialDistributionImpl; import org.apache.commons.math.distribution.BinomialDistributionImpl;
import org.apache.commons.math3.util.Pair; import org.apache.commons.math3.util.Pair;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
@ -70,7 +71,6 @@ public class TumorPowerCalculator {
private final double tumorLODThreshold; private final double tumorLODThreshold;
private final double contamination; private final double contamination;
private final boolean enableSmoothing; private final boolean enableSmoothing;
public static int numCacheHits = 0;
private final HashMap<PowerCacheKey, Double> cache = new HashMap<PowerCacheKey, Double>(); private final HashMap<PowerCacheKey, Double> cache = new HashMap<PowerCacheKey, Double>();
@ -134,16 +134,18 @@ public class TumorPowerCalculator {
* @throws MathException * @throws MathException
* *
*/ */
public double cachedPowerCalculation(final int numReads, final double alleleFraction) throws MathException { public double cachedPowerCalculation(final int numReads, final double alleleFraction) {
PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction); PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction);
// we first look up if power for given number of read and allele fraction has already been computed and stored in the cache. // we first look up if power for given number of read and allele fraction has already been computed and stored in the cache.
// if not we compute it and store it in teh cache. // if not we compute it and store it in teh cache.
Double power = cache.get(key); Double power = cache.get(key);
if (power == null) { if (power == null) {
power = calculatePower(numReads, alleleFraction); try {
power = calculatePower(numReads, alleleFraction);
} catch (final Exception ex) {
throw new GATKException("Power calculation failed", ex);
}
cache.put(key, power); cache.put(key, power);
} else {
numCacheHits++;
} }
return power; return power;
} }