Merge pull request #1453 from broadinstitute/db_mutect2
Lots of small improvements to Mutect2 code
This commit is contained in:
commit
4aede99697
|
|
@ -51,12 +51,41 @@
|
|||
|
||||
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
|
||||
|
||||
import htsjdk.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection;
|
||||
import org.broadinstitute.gatk.utils.commandline.Advanced;
|
||||
import org.broadinstitute.gatk.utils.commandline.Argument;
|
||||
import org.broadinstitute.gatk.utils.commandline.Hidden;
|
||||
import org.broadinstitute.gatk.utils.commandline.*;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection {
|
||||
|
||||
/***************************************/
|
||||
// Reference Metadata inputs
|
||||
/***************************************/
|
||||
/**
|
||||
* MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant
|
||||
* in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal
|
||||
* sample to prove the variant is not present in germline.
|
||||
*/
|
||||
@Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false)
|
||||
public List<RodBinding<VariantContext>> cosmicRod = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants.
|
||||
*/
|
||||
@Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false)
|
||||
public List<RodBinding<VariantContext>> normalPanelRod = Collections.emptyList();
|
||||
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false)
|
||||
public boolean M2_DEBUG = false;
|
||||
|
|
|
|||
|
|
@ -201,7 +201,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
// the genotyping engine
|
||||
protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
|
||||
|
||||
|
||||
private byte MIN_TAIL_QUALITY;
|
||||
private double log10GlobalReadMismappingRate;
|
||||
|
||||
|
|
@ -214,9 +213,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
@ArgumentCollection
|
||||
protected LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection();
|
||||
|
||||
|
||||
@Argument(fullName = "debug_read_name", required = false, doc="trace this read name through the calling process")
|
||||
public String DEBUG_READ_NAME = null;
|
||||
protected String DEBUG_READ_NAME = null;
|
||||
|
||||
@Hidden
|
||||
@Advanced
|
||||
|
|
@ -224,24 +222,81 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
final public int MQthreshold = 20;
|
||||
|
||||
|
||||
/***************************************/
|
||||
// Reference Metadata inputs
|
||||
/***************************************/
|
||||
/**
|
||||
* MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant
|
||||
* in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal
|
||||
* sample to prove the variant is not present in germline.
|
||||
*/
|
||||
@Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false)
|
||||
public List<RodBinding<VariantContext>> cosmicRod = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants.
|
||||
*/
|
||||
@Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false)
|
||||
public List<RodBinding<VariantContext>> normalPanelRod = Collections.emptyList();
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
|
||||
|
||||
private HaplotypeBAMWriter haplotypeBAMWriter;
|
||||
/**
|
||||
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
|
||||
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
|
||||
* Records that are filtered in the comp track will be ignored.
|
||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||
*/
|
||||
@Advanced
|
||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||
|
||||
/**
|
||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"}));
|
||||
|
||||
/**
|
||||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||
protected List<String> annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"}));
|
||||
|
||||
/**
|
||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||
*/
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
protected String[] annotationClassesToUse = { };
|
||||
|
||||
@Output(doc="File to which variants should be written")
|
||||
protected VariantContextWriter vcfWriter = null;
|
||||
|
||||
/**
|
||||
* Active region trimmer reference.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer();
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||
protected String keepRG = null;
|
||||
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||
public byte MIN_BASE_QUALTY_SCORE = 10;
|
||||
|
||||
public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||
protected boolean errorCorrectReads = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
|
||||
protected boolean captureAssemblyFailureBAM = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
|
||||
protected boolean dontUseSoftClippedBases = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean justDetermineActiveRegions = false;
|
||||
|
||||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 500;
|
||||
|
||||
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
||||
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
||||
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
|
|
@ -250,15 +305,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
samplesList = new IndexedSampleList(new ArrayList<>(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader())));
|
||||
|
||||
// MUTECT: check that we have at least one tumor bam
|
||||
for(SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) {
|
||||
for(final SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) {
|
||||
if (id.getTags().getPositionalTags().size() == 0) {
|
||||
throw new RuntimeException("BAMs must be tagged as either 'tumor' or 'normal'");
|
||||
}
|
||||
|
||||
// only supports single-sample BAMs (ie first read group is representative)
|
||||
String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample();
|
||||
final String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample();
|
||||
|
||||
for(String tag : id.getTags().getPositionalTags()) {
|
||||
for(final String tag : id.getTags().getPositionalTags()) {
|
||||
if (BAM_TAG_TUMOR.equalsIgnoreCase(tag)) {
|
||||
tumorSAMReaderIDs.add(id);
|
||||
if (tumorSampleName == null) {
|
||||
|
|
@ -330,9 +385,9 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes();
|
||||
|
||||
final GenomeAnalysisEngine toolkit = getToolkit();
|
||||
final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser();
|
||||
|
||||
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, genomeLocParser, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), MTAC, logger), !doNotRunPhysicalPhasing, MTAC);
|
||||
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC,
|
||||
tumorSampleName, normalSampleName, DEBUG_READ_NAME);
|
||||
|
||||
genotypingEngine.setCrossHaplotypeEventMerger(variantMerger);
|
||||
genotypingEngine.setAnnotationEngine(annotationEngine);
|
||||
|
|
@ -353,7 +408,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
trimmer.snpPadding = 50;
|
||||
|
||||
samplesList = toolkit.getReadSampleList();
|
||||
Set<String> sampleSet = SampleListUtils.asSet(samplesList);
|
||||
final Set<String> sampleSet = SampleListUtils.asSet(samplesList);
|
||||
|
||||
if( MTAC.CONTAMINATION_FRACTION_FILE != null )
|
||||
MTAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(MTAC.CONTAMINATION_FRACTION_FILE, MTAC.CONTAMINATION_FRACTION, sampleSet, logger));
|
||||
|
|
@ -367,7 +422,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
||||
final Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
||||
|
||||
// all annotation fields from VariantAnnotatorEngine
|
||||
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||
|
|
@ -383,7 +438,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
headerInfo.addAll(getM2HeaderLines());
|
||||
headerInfo.addAll(getSampleHeaderLines());
|
||||
|
||||
List<String> outputSampleNames = getOutputSampleNames();
|
||||
final List<String> outputSampleNames = getOutputSampleNames();
|
||||
|
||||
vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames));
|
||||
|
||||
|
|
@ -391,7 +446,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
private Set<VCFHeaderLine> getM2HeaderLines(){
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
||||
final Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
||||
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NORMAL_LOD_KEY));
|
||||
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_LOD_KEY));
|
||||
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY));
|
||||
|
|
@ -430,21 +485,21 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
private Set<VCFHeaderLine> getSampleHeaderLines(){
|
||||
Set<VCFHeaderLine> sampleLines = new HashSet<>();
|
||||
final Set<VCFHeaderLine> sampleLines = new HashSet<>();
|
||||
if (printTCGAsampleHeader) {
|
||||
//NOTE: This will only list the first bam file for each tumor/normal sample if there is more than one
|
||||
Map<String, String> normalSampleHeaderAttributes = new HashMap<>();
|
||||
final Map<String, String> normalSampleHeaderAttributes = new HashMap<>();
|
||||
normalSampleHeaderAttributes.put("ID", "NORMAL");
|
||||
normalSampleHeaderAttributes.put("SampleName", normalSampleName);
|
||||
if (normalSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
|
||||
normalSampleHeaderAttributes.put("File", normalSAMReaderIDs.iterator().next().getSamFilePath());
|
||||
VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes);
|
||||
Map<String, String> tumorSampleHeaderAttributes = new HashMap<>();
|
||||
final VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes);
|
||||
final Map<String, String> tumorSampleHeaderAttributes = new HashMap<>();
|
||||
tumorSampleHeaderAttributes.put("ID", "TUMOR");
|
||||
tumorSampleHeaderAttributes.put("SampleName", tumorSampleName);
|
||||
if (tumorSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
|
||||
tumorSampleHeaderAttributes.put("File", tumorSAMReaderIDs.iterator().next().getSamFilePath());
|
||||
VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes);
|
||||
final VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes);
|
||||
|
||||
sampleLines.add(normalSampleHeader);
|
||||
sampleLines.add(tumorSampleHeader);
|
||||
|
|
@ -455,7 +510,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
private List<String> getOutputSampleNames(){
|
||||
if (printTCGAsampleHeader) {
|
||||
//Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
|
||||
List<String> sampleNamePlaceholders = new ArrayList<>(2);
|
||||
final List<String> sampleNamePlaceholders = new ArrayList<>(2);
|
||||
sampleNamePlaceholders.add("TUMOR");
|
||||
sampleNamePlaceholders.add("NORMAL");
|
||||
return sampleNamePlaceholders;
|
||||
|
|
@ -466,14 +521,14 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
@Override
|
||||
public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
if( context == null || context.getBasePileup().isEmpty() )
|
||||
// if we don't have any data, just abort early
|
||||
return new ActivityProfileState(ref.getLocus(), 0.0);
|
||||
|
||||
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
|
||||
AlignmentContext tumorContext = splitContexts.get(tumorSampleName);
|
||||
AlignmentContext normalContext = splitContexts.get(normalSampleName);
|
||||
final AlignmentContext tumorContext = splitContexts.get(tumorSampleName);
|
||||
final AlignmentContext normalContext = splitContexts.get(normalSampleName);
|
||||
|
||||
// if there are no tumor reads... there is no activity!
|
||||
if (tumorContext == null) {
|
||||
|
|
@ -481,7 +536,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
// KCIBUL -- this method was inlined and modified from ReferenceConfidenceModel
|
||||
ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold);
|
||||
final ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold);
|
||||
final double[] tumorGLs = calcGenotypeLikelihoodsOfRefVsAny(tumorPileup, ref.getBase(), MIN_BASE_QUALTY_SCORE);
|
||||
final double tumorLod = tumorGLs[1] - tumorGLs[0];
|
||||
|
||||
|
|
@ -495,7 +550,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
// in any case, we have to handle the case where there is no normal (and thus no normal context) which is
|
||||
// different than having a normal but having no reads (where we should not enter the active region)
|
||||
if (normalSampleName != null && normalContext != null) {
|
||||
int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE);
|
||||
final int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE);
|
||||
|
||||
final double[] normalGLs = calcGenotypeLikelihoodsOfRefVsAny(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, 0.5f);
|
||||
final double normalLod = normalGLs[0] - normalGLs[1];
|
||||
|
|
@ -528,13 +583,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
|
||||
// No reads here so nothing to do!
|
||||
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
|
||||
|
||||
logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region");
|
||||
|
||||
// create the assembly using just high quality reads (Q20 or higher). We want to use lower
|
||||
// quality reads in the PairHMM (and especially in the normal) later, so we can't use a ReadFilter
|
||||
ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension());
|
||||
for (GATKSAMRecord rec : originalActiveRegion.getReads()) {
|
||||
final ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension());
|
||||
for (final GATKSAMRecord rec : originalActiveRegion.getReads()) {
|
||||
if (rec.getMappingQuality() >= MQthreshold ) {
|
||||
assemblyActiveRegion.add(rec);
|
||||
}
|
||||
|
|
@ -558,18 +612,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
// Stop the trimming madness!!!
|
||||
if (!trimmingResult.isVariationPresent())
|
||||
return referenceModelForNoVariation(originalActiveRegion,false);
|
||||
|
||||
logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result");
|
||||
|
||||
final AssemblyResultSet assemblyResult =
|
||||
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
|
||||
|
||||
// final AssemblyResultSet assemblyResult = untrimmedAssemblyResult;
|
||||
|
||||
// after talking to Ryan -- they grab the reads out of the assembly (and trim then) to pass into the PairHMM
|
||||
// because at one point they were trying error correcting of the reads based on the haplotypes.. but that is not
|
||||
// working out, so it's safe for us just to take the reads
|
||||
//
|
||||
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
|
||||
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping");
|
||||
|
||||
|
|
@ -577,10 +624,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads());
|
||||
}
|
||||
|
||||
//
|
||||
// final ActiveRegion regionForGenotyping = trimmingResult.getCallableRegion();
|
||||
|
||||
// final ActiveRegion regionForGenotyping = originalActiveRegion;
|
||||
|
||||
// filter out reads from genotyping which fail mapping quality based criteria
|
||||
//TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method
|
||||
|
|
@ -594,7 +637,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads);
|
||||
|
||||
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads");
|
||||
|
||||
// abort early if something is out of the acceptable range
|
||||
|
|
@ -612,17 +654,17 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
|
||||
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
|
||||
final Map<String,List<GATKSAMRecord>> reads = splitReadsBySample( regionForGenotyping.getReads() );
|
||||
for (List<GATKSAMRecord> rec : reads.values()) {
|
||||
for (final List<GATKSAMRecord> rec : reads.values()) {
|
||||
logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample");
|
||||
}
|
||||
|
||||
final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>();
|
||||
for (GATKSAMRecord read : regionForGenotyping.getReads()) {
|
||||
for (final GATKSAMRecord read : regionForGenotyping.getReads()) {
|
||||
ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality());
|
||||
}
|
||||
|
||||
// modify MAPQ scores in normal to be high so that we don't do any base quality score capping
|
||||
for(GATKSAMRecord rec : regionForGenotyping.getReads()) {
|
||||
for(final GATKSAMRecord rec : regionForGenotyping.getReads()) {
|
||||
if (isReadFromNormal(rec)) {
|
||||
rec.setMappingQuality(60);
|
||||
}
|
||||
|
|
@ -647,8 +689,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
readLikelihoods.changeReads(readRealignments);
|
||||
|
||||
for (GATKSAMRecord rec : readRealignments.keySet()) {
|
||||
for (final GATKSAMRecord rec : readRealignments.keySet()) {
|
||||
logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods");
|
||||
}
|
||||
|
||||
|
|
@ -666,15 +707,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
assemblyResult.getFullReferenceWithPadding(),
|
||||
assemblyResult.getPaddedReferenceLoc(),
|
||||
regionForGenotyping.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
metaDataTracker,
|
||||
givenAlleles, false ,
|
||||
tumorSampleName,
|
||||
normalSampleName,
|
||||
dbsnp.dbsnp,
|
||||
cosmicRod,
|
||||
DEBUG_READ_NAME
|
||||
);
|
||||
givenAlleles);
|
||||
|
||||
if ( MTAC.bamWriter != null ) {
|
||||
final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes());
|
||||
|
|
@ -695,16 +729,16 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
||||
|
||||
|
||||
List<VariantContext> annotatedCalls = new ArrayList<>();
|
||||
int eventCount = calledHaplotypes.getCalls().size();
|
||||
final List<VariantContext> annotatedCalls = new ArrayList<>();
|
||||
final int eventCount = calledHaplotypes.getCalls().size();
|
||||
Integer minEventDistance = null;
|
||||
Integer maxEventDistance = null;
|
||||
Integer lastPosition = null;
|
||||
for (VariantContext vc : calledHaplotypes.getCalls()) {
|
||||
for (final VariantContext vc : calledHaplotypes.getCalls()) {
|
||||
if (lastPosition == null) {
|
||||
lastPosition = vc.getStart();
|
||||
} else {
|
||||
int dist = Math.abs(vc.getStart() - lastPosition);
|
||||
final int dist = Math.abs(vc.getStart() - lastPosition);
|
||||
if (maxEventDistance == null || dist > maxEventDistance) {
|
||||
maxEventDistance = dist;
|
||||
}
|
||||
|
|
@ -713,23 +747,23 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
}
|
||||
}
|
||||
Map<String, Object> eventDistanceAttributes = new HashMap<>();
|
||||
final Map<String, Object> eventDistanceAttributes = new HashMap<>();
|
||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
|
||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance);
|
||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance);
|
||||
|
||||
|
||||
// can we do this with the Annotation classes instead?
|
||||
for (VariantContext originalVC : calledHaplotypes.getCalls()) {
|
||||
VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
|
||||
for (final VariantContext originalVC : calledHaplotypes.getCalls()) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
|
||||
|
||||
Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
|
||||
final Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
|
||||
attributes.putAll(eventDistanceAttributes);
|
||||
vcb.attributes(attributes);
|
||||
|
||||
Set<String> filters = new HashSet<>(originalVC.getFilters());
|
||||
final Set<String> filters = new HashSet<>(originalVC.getFilters());
|
||||
|
||||
double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
|
||||
final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
|
||||
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
|
||||
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
|
||||
}
|
||||
|
|
@ -739,22 +773,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
|
||||
}
|
||||
|
||||
if (filters.size() > 0) {
|
||||
vcb.filters(filters);
|
||||
} else {
|
||||
vcb.passFilters();
|
||||
}
|
||||
vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters);
|
||||
|
||||
if (printTCGAsampleHeader) {
|
||||
GenotypesContext genotypesWithBamSampleNames = originalVC.getGenotypes();
|
||||
List<Genotype> renamedGenotypes = new ArrayList<>();
|
||||
GenotypeBuilder GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(tumorSampleName));
|
||||
GTbuilder.name("TUMOR");
|
||||
renamedGenotypes.add(GTbuilder.make());
|
||||
GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(normalSampleName));
|
||||
GTbuilder.name("NORMAL");
|
||||
renamedGenotypes.add(GTbuilder.make());
|
||||
vcb.genotypes(renamedGenotypes);
|
||||
final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make();
|
||||
final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make();
|
||||
vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype));
|
||||
}
|
||||
|
||||
annotatedCalls.add(vcb.make());
|
||||
|
|
@ -765,15 +789,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
return annotatedCalls;
|
||||
}
|
||||
|
||||
private Set<String> calculateFilters(RefMetaDataTracker metaDataTracker, VariantContext vc, Map<String, Object> eventDistanceAttributes) {
|
||||
Set<String> filters = new HashSet<>();
|
||||
private Set<String> calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map<String, Object> eventDistanceAttributes) {
|
||||
final Set<String> filters = new HashSet<>();
|
||||
|
||||
Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
|
||||
Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY);
|
||||
final Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
|
||||
final Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY);
|
||||
|
||||
Collection<VariantContext> panelOfNormalsVC = metaDataTracker.getValues(normalPanelRod,
|
||||
final Collection<VariantContext> panelOfNormalsVC = metaDataTracker.getValues(MTAC.normalPanelRod,
|
||||
getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart()));
|
||||
VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next();
|
||||
final VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next();
|
||||
|
||||
if (ponVc != null) {
|
||||
filters.add(GATKVCFConstants.PON_FILTER_NAME);
|
||||
|
|
@ -788,13 +812,13 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
double normalF = 0;
|
||||
int normalAltQualityScoreSum = 0;
|
||||
if (hasNormal()) {
|
||||
Genotype normalGenotype = vc.getGenotype(normalSampleName);
|
||||
final Genotype normalGenotype = vc.getGenotype(normalSampleName);
|
||||
|
||||
// NOTE: how do we get the non-ref depth here?
|
||||
normalAltCounts = normalGenotype.getAD()[1];
|
||||
normalF = (Double) normalGenotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY);
|
||||
|
||||
Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY);
|
||||
final Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY);
|
||||
if (qss != null) {
|
||||
normalAltQualityScoreSum = (Integer) ((Object[]) qss)[1];
|
||||
} else {
|
||||
|
|
@ -819,11 +843,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
// such as ACTACTACT -> ACTACT, are overwhelmingly false positives so we
|
||||
// hard filter them out by default
|
||||
if (vc.isIndel()) {
|
||||
ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY);
|
||||
String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, "");
|
||||
final ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY);
|
||||
final String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, "");
|
||||
if (rpa != null && rpa.size() > 1 && ru.length() > 1) {
|
||||
int refCount = (Integer) rpa.get(0);
|
||||
int altCount = (Integer) rpa.get(1);
|
||||
final int refCount = (Integer) rpa.get(0);
|
||||
final int altCount = (Integer) rpa.get(1);
|
||||
|
||||
if (refCount - altCount == 1) {
|
||||
filters.add(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME);
|
||||
|
|
@ -840,10 +864,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
|
||||
// clustered read position filter
|
||||
if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
|
||||
Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
|
||||
Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY);
|
||||
Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY);
|
||||
Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
|
||||
final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
|
||||
final Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY);
|
||||
final Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY);
|
||||
final Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
|
||||
//If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter
|
||||
if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) ||
|
||||
(tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD))
|
||||
|
|
@ -866,14 +890,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
*/
|
||||
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final double f) {
|
||||
final double[] genotypeLikelihoods = new double[2];
|
||||
int AA = 0, AB=1;
|
||||
final int AA = 0;
|
||||
final int AB=1;
|
||||
for( final PileupElement p : pileup ) {
|
||||
final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual());
|
||||
if( p.isDeletion() || qual > minBaseQual ) {
|
||||
|
||||
// TODO: why not use base qualities here?
|
||||
//double pobs = QualityUtils.qualToErrorProbLog10(qual);
|
||||
double pobs = 1.0d - pow(10, (30 / -10.0));
|
||||
final double pobs = 1.0d - pow(10, (30 / -10.0));
|
||||
if( isNonRef(refBase, p)) {
|
||||
genotypeLikelihoods[AB] += Math.log10(f*pobs + (1-f)*pobs/3.0d);
|
||||
genotypeLikelihoods[AA] += Math.log10((1-pobs)/3);
|
||||
|
|
@ -905,7 +930,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) {
|
||||
double f = calculateF(pileup, refBase, minBaseQual);
|
||||
final double f = calculateF(pileup, refBase, minBaseQual);
|
||||
return calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, minBaseQual, f);
|
||||
}
|
||||
|
||||
|
|
@ -923,11 +948,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
}
|
||||
}
|
||||
double f = (double) altCount / ((double) refCount + (double) altCount);
|
||||
return f;
|
||||
return (double) altCount / (refCount + altCount);
|
||||
}
|
||||
|
||||
private boolean isNonRef(byte refBase, PileupElement p) {
|
||||
private boolean isNonRef(final byte refBase, final PileupElement p) {
|
||||
return p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip();
|
||||
}
|
||||
|
||||
|
|
@ -960,8 +984,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
return readsToRemove;
|
||||
}
|
||||
|
||||
private static GATKSAMRecord findReadByName(Collection<GATKSAMRecord> reads, String name) {
|
||||
for(GATKSAMRecord read : reads) {
|
||||
private static GATKSAMRecord findReadByName(final Collection<GATKSAMRecord> reads, final String name) {
|
||||
for(final GATKSAMRecord read : reads) {
|
||||
if (name.equals(read.getReadName())) return read;
|
||||
}
|
||||
return null;
|
||||
|
|
@ -1022,7 +1046,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
@Override
|
||||
public Integer reduce(List<VariantContext> callsInRegion, Integer numCalledRegions) {
|
||||
public Integer reduce(final List<VariantContext> callsInRegion, final Integer numCalledRegions) {
|
||||
for( final VariantContext call : callsInRegion ) {
|
||||
vcfWriter.add( call );
|
||||
}
|
||||
|
|
@ -1030,7 +1054,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
public void onTraversalDone(final Integer result) {
|
||||
// if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
|
||||
// referenceConfidenceModel.close();
|
||||
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed
|
||||
|
|
@ -1045,114 +1069,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||
public boolean alwaysAppendDbsnpId() { return false; }
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
||||
|
||||
/**
|
||||
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
|
||||
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
|
||||
* Records that are filtered in the comp track will be ignored.
|
||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||
*/
|
||||
@Advanced
|
||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
// protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"}));
|
||||
// protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBy ruSample", "TandemRepeatAnnotator",
|
||||
// "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"}));
|
||||
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"}));
|
||||
|
||||
/**
|
||||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||
protected List<String> annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"}));
|
||||
|
||||
/**
|
||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||
*/
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
//protected String[] annotationGroupsToUse = { StandardAnnotation.class.getSimpleName() };
|
||||
protected String[] annotationClassesToUse = { };
|
||||
|
||||
/**
|
||||
* A raw, unfiltered, highly sensitive callset in VCF format.
|
||||
*/
|
||||
@Output(doc="File to which variants should be written")
|
||||
protected VariantContextWriter vcfWriter = null;
|
||||
|
||||
/**
|
||||
* Active region trimmer reference.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer();
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||
protected String keepRG = null;
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* The minimum confidence needed for a given base for it to be used in variant calling.
|
||||
*/
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||
public byte MIN_BASE_QUALTY_SCORE = 10;
|
||||
|
||||
|
||||
|
||||
// PAIR-HMM-Related Goodness
|
||||
|
||||
// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
|
||||
// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.AGGRESSIVE;
|
||||
public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE;
|
||||
|
||||
// Parameters to control read error correction
|
||||
@Hidden
|
||||
@Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||
protected boolean errorCorrectReads = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
|
||||
protected boolean captureAssemblyFailureBAM = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
|
||||
protected boolean dontUseSoftClippedBases = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean justDetermineActiveRegions = false;
|
||||
|
||||
|
||||
|
||||
|
||||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 500;
|
||||
|
||||
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
||||
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
||||
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* High-level function that runs the assembler on the active region reads,
|
||||
* returning a data structure with the resulting information needed
|
||||
|
|
@ -1224,14 +1140,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
|
||||
clippedRead = ReadClipper.hardClipToRegion(clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop());
|
||||
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
|
||||
readsToUse.add(clippedRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
|
||||
|
||||
final List<GATKSAMRecord> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
|
||||
|
||||
if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) {
|
||||
|
|
@ -1268,9 +1181,9 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
*
|
||||
* @param reads the list of reads to consider
|
||||
*/
|
||||
private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads, Set<String> normalSampleNames) {
|
||||
Map<String, List<GATKSAMRecord>> data = splitReadsBySample(reads);
|
||||
for ( String sampleName : data.keySet() ) {
|
||||
private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads, final Set<String> normalSampleNames) {
|
||||
final Map<String, List<GATKSAMRecord>> data = splitReadsBySample(reads);
|
||||
for ( final String sampleName : data.keySet() ) {
|
||||
final boolean isTumor = !normalSampleNames.contains(sampleName);
|
||||
final List<GATKSAMRecord> perSampleReadList = data.get(sampleName);
|
||||
|
||||
|
|
@ -1284,16 +1197,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
}
|
||||
}
|
||||
|
||||
public static void logReadInfo(String readName, Collection<GATKSAMRecord> records, String message) {
|
||||
public static void logReadInfo(final String readName, final Collection<GATKSAMRecord> records, final String message) {
|
||||
if (readName != null) {
|
||||
for (GATKSAMRecord rec : records) {
|
||||
for (final GATKSAMRecord rec : records) {
|
||||
logReadInfo(readName, rec, message);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public static void logReadInfo(String readName, GATKSAMRecord rec, String message) {
|
||||
public static void logReadInfo(final String readName, final GATKSAMRecord rec, final String message) {
|
||||
if (readName != null && rec != null && readName.equals(rec.getReadName())) {
|
||||
logger.info("Found " + rec.toString() + " - " + message);
|
||||
}
|
||||
|
|
@ -1322,7 +1234,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
return result;
|
||||
}
|
||||
|
||||
private boolean isReadFromNormal(GATKSAMRecord rec) {
|
||||
private boolean isReadFromNormal(final GATKSAMRecord rec) {
|
||||
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
|
||||
|
||||
}
|
||||
|
|
@ -1338,7 +1250,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
|||
@Advanced
|
||||
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
|
||||
protected boolean doNotRunPhysicalPhasing = false;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -54,9 +54,11 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2;
|
|||
import com.google.java.contract.Ensures;
|
||||
import htsjdk.samtools.util.StringUtil;
|
||||
import htsjdk.variant.variantcontext.*;
|
||||
import org.apache.commons.collections.ListUtils;
|
||||
import org.apache.commons.lang.mutable.MutableDouble;
|
||||
import org.apache.commons.lang.mutable.MutableInt;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
|
||||
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
|
||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||
|
|
@ -78,16 +80,33 @@ import java.util.*;
|
|||
|
||||
public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||
|
||||
protected final M2ArgumentCollection MTAC;
|
||||
private final M2ArgumentCollection MTAC;
|
||||
private final TumorPowerCalculator strandArtifactPowerCalculator;
|
||||
final boolean REF_AND_ALT = false;
|
||||
final boolean ALT_ONLY = true;
|
||||
|
||||
private final String tumorSampleName;
|
||||
private final String matchedNormalSampleName;
|
||||
private final String DEBUG_READ_NAME;
|
||||
|
||||
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
|
||||
private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
|
||||
public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; }
|
||||
};
|
||||
|
||||
private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class);
|
||||
|
||||
public SomaticGenotypingEngine(final M2ArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing, final M2ArgumentCollection MTAC) {
|
||||
super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing);
|
||||
public SomaticGenotypingEngine(final M2ArgumentCollection configuration,
|
||||
final SampleList samples,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final boolean doPhysicalPhasing,
|
||||
final M2ArgumentCollection MTAC,
|
||||
final String tumorSampleName,
|
||||
final String matchedNormalSampleName,
|
||||
final String DEBUG_READ_NAME) {
|
||||
super(configuration, samples, genomeLocParser, DUMMY_AF_CALCULATOR_PROVIDER, doPhysicalPhasing);
|
||||
this.MTAC = MTAC;
|
||||
this.tumorSampleName = tumorSampleName;
|
||||
this.matchedNormalSampleName = matchedNormalSampleName;
|
||||
this.DEBUG_READ_NAME = DEBUG_READ_NAME;
|
||||
// coverage related initialization
|
||||
final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10));
|
||||
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
|
||||
|
|
@ -105,37 +124,22 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
* @param ref Reference bytes at active region
|
||||
* @param refLoc Corresponding active region genome location
|
||||
* @param activeRegionWindow Active window
|
||||
* @param genomeLocParser GenomeLocParser
|
||||
* @param activeAllelesToGenotype Alleles to genotype
|
||||
* @param emitReferenceConfidence whether we should add a <NON_REF> alternative allele to the result variation contexts.
|
||||
*
|
||||
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
|
||||
*
|
||||
*/
|
||||
// @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
@Ensures("result != null")
|
||||
// TODO - can this be refactored? this is hard to follow!
|
||||
public CalledHaplotypes callMutations (
|
||||
final List<Haplotype> haplotypes,
|
||||
//final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final ReadLikelihoods<Haplotype> readLikelihoods,
|
||||
final Map<String, Integer> originalNormalReadQualities,
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final RefMetaDataTracker tracker,
|
||||
final List<VariantContext> activeAllelesToGenotype,
|
||||
final boolean emitReferenceConfidence,
|
||||
final String tumorSampleName,
|
||||
final String matchedNormalSampleName,
|
||||
final RodBinding<VariantContext> dbsnpRod,
|
||||
final List<RodBinding<VariantContext>> cosmicRod,
|
||||
final String DEBUG_READ_NAME
|
||||
|
||||
) {
|
||||
|
||||
final List<Haplotype> haplotypes,
|
||||
final ReadLikelihoods<Haplotype> readLikelihoods,
|
||||
final Map<String, Integer> originalNormalReadQualities,
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final RefMetaDataTracker tracker,
|
||||
final List<VariantContext> activeAllelesToGenotype) {
|
||||
// sanity check input arguments
|
||||
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
||||
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
|
||||
|
|
@ -143,8 +147,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
||||
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
||||
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
||||
if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
|
||||
|
||||
|
||||
// Somatic Tumor/Normal Sample Handling
|
||||
verifySamplePresence(tumorSampleName, readLikelihoods.samples());
|
||||
|
|
@ -159,293 +161,217 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
final List<VariantContext> returnCalls = new ArrayList<>();
|
||||
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
||||
|
||||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||
|
||||
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
|
||||
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
|
||||
|
||||
// Sanity check the priority list for mistakes
|
||||
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
|
||||
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
|
||||
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
||||
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
if (emitReferenceConfidence)
|
||||
mergedVC = addNonRefSymbolicAllele(mergedVC);
|
||||
|
||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
|
||||
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
|
||||
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
|
||||
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
|
||||
}
|
||||
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
||||
|
||||
if( configuration.DEBUG && logger != null ) {
|
||||
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
}
|
||||
|
||||
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
|
||||
|
||||
//LDG: do we want to do this before or after pulling out overlapping reads?
|
||||
if (MTAC.isSampleContaminationPresent())
|
||||
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
|
||||
|
||||
//if (!mergedVC.isBiallelic()) {
|
||||
// logger.info("[UNSUPPORTED] Detected non-Biallelic VC" + mergedVC.toString());
|
||||
// continue;
|
||||
//}
|
||||
|
||||
// TODO: once tests are passing, refactor to use the new data structure (not the deprecated one)
|
||||
// handle overlapping fragments
|
||||
// TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!)
|
||||
PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
|
||||
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
|
||||
// extend to multiple samples
|
||||
|
||||
// compute tumor LOD for each alternate allele
|
||||
// TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you
|
||||
final PerAlleleCollection<Double> altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false);
|
||||
final PerAlleleCollection<Double> tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions);
|
||||
|
||||
if( configuration.DEBUG && logger != null ) {
|
||||
StringBuilder outputSB = new StringBuilder("Calculated allelic fraction at " + loc + " = [");
|
||||
for (Allele allele : altAlleleFractions.getAltAlleles()){
|
||||
outputSB.append( allele + ": " + altAlleleFractions.getAlt(allele) + ", ");
|
||||
}
|
||||
outputSB.append("]");
|
||||
logger.info(outputSB.toString());
|
||||
}
|
||||
|
||||
final PerAlleleCollection<Double> tumorLods = PerAlleleCollection.createPerAltAlleleCollection();
|
||||
for (Allele altAllele : mergedVC.getAlternateAlleles()){
|
||||
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
|
||||
}
|
||||
|
||||
if (configuration.DEBUG && logger != null) {
|
||||
StringBuilder outputSB = new StringBuilder("Tumor LOD at " + loc + " = [");
|
||||
for (Allele altAllele : tumorLods.getAltAlleles()) {
|
||||
outputSB.append( altAllele + ": " + tumorLods.getAlt(altAllele) + ", ");
|
||||
}
|
||||
outputSB.append("]");
|
||||
logger.info(outputSB.toString());
|
||||
}
|
||||
|
||||
double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
||||
double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
||||
PerReadAlleleLikelihoodMap normalPRALM = null;
|
||||
PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
|
||||
|
||||
// if normal bam is available, compute normal LOD
|
||||
if (hasNormal) {
|
||||
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
|
||||
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads");
|
||||
|
||||
GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
|
||||
Collection<VariantContext> cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc);
|
||||
Collection<VariantContext> dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc);
|
||||
// remove the effect of cosmic from dbSNP
|
||||
final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
|
||||
|
||||
INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal
|
||||
NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD;
|
||||
|
||||
|
||||
// compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5
|
||||
// note normal LOD is REF over ALT, the reciprocal of the tumor LOD
|
||||
final PerAlleleCollection<Double> diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
||||
for (final Allele allele : mergedVC.getAlternateAlleles()){
|
||||
diploidHetAlleleFractions.setAlt(allele, 0.5);
|
||||
}
|
||||
|
||||
final PerAlleleCollection<Double> normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions);
|
||||
|
||||
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
|
||||
normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int numPassingAlts = 0;
|
||||
Set<Allele> allelesThatPassThreshold = new HashSet<>();
|
||||
Allele alleleWithHighestTumorLOD = null;
|
||||
|
||||
// TODO: use lambda
|
||||
for (Allele altAllele : mergedVC.getAlternateAlleles()) {
|
||||
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
|
||||
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true;
|
||||
if (passesTumorLodThreshold && passesNormalLodThreshold) {
|
||||
numPassingAlts++;
|
||||
allelesThatPassThreshold.add(altAllele);
|
||||
if (alleleWithHighestTumorLOD == null
|
||||
|| tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){
|
||||
alleleWithHighestTumorLOD = altAllele;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final boolean emitVariant = numPassingAlts > 0;
|
||||
|
||||
VariantContext call = null;
|
||||
if (emitVariant) {
|
||||
VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
|
||||
// FIXME: can simply get first alternate since above we only deal with Bi-allelic sites...
|
||||
int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size();
|
||||
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
||||
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD));
|
||||
|
||||
if (hasNormal) {
|
||||
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD));
|
||||
if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) {
|
||||
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
|
||||
}
|
||||
}
|
||||
|
||||
// M1-style strand artifact filter
|
||||
// TODO: move code to MuTect2::calculateFilters()
|
||||
// skip if VC has multiple alleles - it will get filtered later anyway
|
||||
if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) {
|
||||
final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap();
|
||||
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
|
||||
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
|
||||
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
|
||||
|
||||
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
|
||||
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
|
||||
final PerAlleleCollection<Double> alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
|
||||
final PerAlleleCollection<Double> tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward);
|
||||
|
||||
final PerAlleleCollection<Double> alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
|
||||
final PerAlleleCollection<Double> tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
|
||||
|
||||
if( configuration.DEBUG && logger != null ) {
|
||||
StringBuilder forwardMessage = new StringBuilder("Calculated forward allelic fraction at " + loc + " = [");
|
||||
StringBuilder reverseMessage = new StringBuilder("Calculated reverse allelic fraction at " + loc + " = [");
|
||||
|
||||
for (Allele altAllele : altAlleleFractions.getAltAlleles()){
|
||||
forwardMessage.append( altAllele + ": " + alleleFractionsForward.getAlt(altAllele) + ", ");
|
||||
reverseMessage.append( altAllele + ": " + alleleFractionsReverse.getAlt(altAllele) + ", ");
|
||||
}
|
||||
|
||||
forwardMessage.append("]");
|
||||
reverseMessage.append("]");
|
||||
|
||||
logger.info(forwardMessage.toString());
|
||||
logger.info(reverseMessage.toString());
|
||||
}
|
||||
|
||||
double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
|
||||
double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
|
||||
|
||||
double tumorSBpower_fwd = 0.0;
|
||||
double tumorSBpower_rev = 0.0;
|
||||
try {
|
||||
// Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction
|
||||
tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
||||
tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
||||
}
|
||||
catch (Throwable t) {
|
||||
System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc);
|
||||
t.printStackTrace(System.err);
|
||||
throw new RuntimeException(t);
|
||||
}
|
||||
|
||||
callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd);
|
||||
callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev);
|
||||
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd);
|
||||
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev);
|
||||
// TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
||||
|
||||
if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ||
|
||||
(tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD))
|
||||
callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME);
|
||||
}
|
||||
|
||||
// TODO: this probably belongs in M2::calculateFilters()
|
||||
if (numPassingAlts > 1) {
|
||||
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
|
||||
}
|
||||
|
||||
// build genotypes TODO: this part needs review and refactor
|
||||
List<Allele> tumorAlleles = new ArrayList<>();
|
||||
tumorAlleles.add(mergedVC.getReference());
|
||||
tumorAlleles.add(alleleWithHighestTumorLOD);
|
||||
Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles)
|
||||
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD))
|
||||
.make(); // TODO: add ADs?
|
||||
List<Genotype> genotypes = new ArrayList<>();
|
||||
genotypes.add(tumorGenotype);
|
||||
|
||||
// We assume that the genotype in the normal is 0/0
|
||||
// TODO: is normal always homozygous reference?
|
||||
List<Allele> homRefAllelesforNormalGenotype = new ArrayList<>();
|
||||
homRefAllelesforNormalGenotype.addAll(Collections.nCopies(2, mergedVC.getReference()));
|
||||
|
||||
// if we are calling with a normal, build the genotype for the sample to appear in vcf
|
||||
int REF = 0, ALT = 1;
|
||||
if (hasNormal) {
|
||||
PerAlleleCollection<Integer> normalCounts = getRefAltCount(mergedVC, normalPRALM, false);
|
||||
final int normalRefAlleleDepth = normalCounts.getRef();
|
||||
final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD);
|
||||
final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth };
|
||||
final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth);
|
||||
|
||||
final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype)
|
||||
.AD(normalAlleleDepths)
|
||||
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction)
|
||||
.make();
|
||||
genotypes.add(normalGenotype);
|
||||
}
|
||||
|
||||
//only use alleles found in the tumor (
|
||||
call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
|
||||
|
||||
}
|
||||
|
||||
// how should we be making use of _perSampleFilteredReadList_?
|
||||
if( call != null ) {
|
||||
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
|
||||
genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call);
|
||||
|
||||
ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
|
||||
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
|
||||
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
|
||||
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||
|
||||
// maintain the set of all called haplotypes
|
||||
for ( final Allele calledAllele : call.getAlleles() ) {
|
||||
final List<Haplotype> haplotypeList = alleleMapper.get(calledAllele);
|
||||
if (haplotypeList == null) continue;
|
||||
calledHaplotypes.addAll(haplotypeList);
|
||||
}
|
||||
|
||||
returnCalls.add( annotatedCall );
|
||||
}
|
||||
|
||||
if( loc < activeRegionWindow.getStart() || loc > activeRegionWindow.getStop() ) {
|
||||
continue;
|
||||
}
|
||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
||||
|
||||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||
|
||||
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
|
||||
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
|
||||
|
||||
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
|
||||
|
||||
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
|
||||
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
||||
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
|
||||
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
|
||||
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
|
||||
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
|
||||
}
|
||||
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
||||
|
||||
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
|
||||
|
||||
//LDG: do we want to do this before or after pulling out overlapping reads?
|
||||
if (MTAC.isSampleContaminationPresent())
|
||||
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
|
||||
|
||||
final PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
|
||||
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
|
||||
// extend to multiple samples
|
||||
|
||||
// compute tumor LOD for each alternate allele
|
||||
// TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you
|
||||
final PerAlleleCollection<Double> altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false);
|
||||
final PerAlleleCollection<Double> tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions);
|
||||
|
||||
final PerAlleleCollection<Double> tumorLods = PerAlleleCollection.createPerAltAlleleCollection();
|
||||
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
|
||||
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
|
||||
}
|
||||
|
||||
double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
||||
double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
||||
PerReadAlleleLikelihoodMap normalPRALM = null;
|
||||
final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
|
||||
|
||||
// if normal bam is available, compute normal LOD
|
||||
if (hasNormal) {
|
||||
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
|
||||
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads");
|
||||
|
||||
final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
|
||||
final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc);
|
||||
final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc);
|
||||
// remove the effect of cosmic from dbSNP
|
||||
final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
|
||||
|
||||
INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal
|
||||
NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD;
|
||||
|
||||
// compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5
|
||||
// note normal LOD is REF over ALT, the reciprocal of the tumor LOD
|
||||
final PerAlleleCollection<Double> diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
||||
for (final Allele allele : mergedVC.getAlternateAlleles()){
|
||||
diploidHetAlleleFractions.setAlt(allele, 0.5);
|
||||
}
|
||||
|
||||
final PerAlleleCollection<Double> normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions);
|
||||
|
||||
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
|
||||
normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele));
|
||||
}
|
||||
}
|
||||
|
||||
int numPassingAlts = 0;
|
||||
final Set<Allele> allelesThatPassThreshold = new HashSet<>();
|
||||
Allele alleleWithHighestTumorLOD = null;
|
||||
|
||||
// TODO: use lambda
|
||||
for (final Allele altAllele : mergedVC.getAlternateAlleles()) {
|
||||
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
|
||||
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true;
|
||||
if (passesTumorLodThreshold && passesNormalLodThreshold) {
|
||||
numPassingAlts++;
|
||||
allelesThatPassThreshold.add(altAllele);
|
||||
if (alleleWithHighestTumorLOD == null
|
||||
|| tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){
|
||||
alleleWithHighestTumorLOD = altAllele;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (numPassingAlts == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
final VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
|
||||
// FIXME: can simply get first alternate since above we only deal with Bi-allelic sites...
|
||||
final int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size();
|
||||
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
||||
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD));
|
||||
|
||||
if (hasNormal) {
|
||||
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD));
|
||||
if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) {
|
||||
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
|
||||
}
|
||||
}
|
||||
|
||||
// M1-style strand artifact filter
|
||||
// TODO: move code to MuTect2::calculateFilters()
|
||||
// skip if VC has multiple alleles - it will get filtered later anyway
|
||||
if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) {
|
||||
final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap();
|
||||
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
|
||||
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
|
||||
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
|
||||
MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
|
||||
|
||||
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
|
||||
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
|
||||
final PerAlleleCollection<Double> alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
|
||||
final PerAlleleCollection<Double> tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward);
|
||||
|
||||
final PerAlleleCollection<Double> alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
|
||||
final PerAlleleCollection<Double> tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
|
||||
|
||||
final double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
|
||||
final double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
|
||||
|
||||
// Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction
|
||||
final double tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
||||
final double tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
||||
|
||||
|
||||
callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd);
|
||||
callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev);
|
||||
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd);
|
||||
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev);
|
||||
// TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
||||
|
||||
if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ||
|
||||
(tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD))
|
||||
callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME);
|
||||
}
|
||||
|
||||
// TODO: this probably belongs in M2::calculateFilters()
|
||||
if (numPassingAlts > 1) {
|
||||
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
|
||||
}
|
||||
|
||||
// build genotypes TODO: this part needs review and refactor
|
||||
final List<Allele> tumorAlleles = Arrays.asList(mergedVC.getReference(), alleleWithHighestTumorLOD);
|
||||
final Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles)
|
||||
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD))
|
||||
.make(); // TODO: add ADs?
|
||||
final List<Genotype> genotypes = new ArrayList<>();
|
||||
genotypes.add(tumorGenotype);
|
||||
|
||||
// We assume that the genotype in the normal is 0/0
|
||||
// TODO: is normal always homozygous reference?
|
||||
final List<Allele> homRefAllelesforNormalGenotype = Collections.nCopies(2, mergedVC.getReference());
|
||||
|
||||
// if we are calling with a normal, build the genotype for the sample to appear in vcf
|
||||
if (hasNormal) {
|
||||
final PerAlleleCollection<Integer> normalCounts = getRefAltCount(mergedVC, normalPRALM, false);
|
||||
final int normalRefAlleleDepth = normalCounts.getRef();
|
||||
final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD);
|
||||
final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth };
|
||||
final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth);
|
||||
|
||||
final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype)
|
||||
.AD(normalAlleleDepths)
|
||||
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction)
|
||||
.make();
|
||||
genotypes.add(normalGenotype);
|
||||
}
|
||||
|
||||
//only use alleles found in the tumor (
|
||||
final VariantContext call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
|
||||
|
||||
// how should we be making use of _perSampleFilteredReadList_?
|
||||
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
|
||||
genomeLocParser, false, alleleMapper, readAlleleLikelihoods, call);
|
||||
|
||||
final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
|
||||
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
|
||||
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
|
||||
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||
|
||||
// maintain the set of all called haplotypes
|
||||
call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll);
|
||||
returnCalls.add( annotatedCall );
|
||||
}
|
||||
|
||||
// TODO: understand effect of enabling this for somatic calling...
|
||||
final List<VariantContext> phasedCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
|
||||
return new CalledHaplotypes(phasedCalls, calledHaplotypes);
|
||||
//return new CalledHaplotypes(returnCalls, calledHaplotypes);
|
||||
final List<VariantContext> outputCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
|
||||
return new CalledHaplotypes(outputCalls, calledHaplotypes);
|
||||
}
|
||||
|
||||
private void verifySamplePresence(String sampleName, List<String> samples) {
|
||||
private void verifySamplePresence(final String sampleName, final List<String> samples) {
|
||||
if (!samples.contains(sampleName)) {
|
||||
throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples));
|
||||
}
|
||||
|
|
@ -459,7 +385,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
* @param alleleFractions allele fraction(s) for alternate allele(s)
|
||||
*
|
||||
* @return genotype likelihoods for homRef and het for each alternate allele
|
||||
*/
|
||||
*/
|
||||
private PerAlleleCollection<Double> getHetGenotypeLogLikelihoods(final VariantContext mergedVC,
|
||||
final PerReadAlleleLikelihoodMap tumorPRALM,
|
||||
final Map<String, Integer> originalNormalMQs,
|
||||
|
|
@ -470,13 +396,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
}
|
||||
|
||||
final PerAlleleCollection<MutableDouble> genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
||||
for (final Allele allele : mergedVC.getAlleles()){
|
||||
genotypeLogLikelihoods.set(allele, new MutableDouble(0.0));
|
||||
}
|
||||
mergedVC.getAlleles().forEach(a -> genotypeLogLikelihoods.set(a, new MutableDouble(0)));
|
||||
|
||||
final Allele refAllele = mergedVC.getReference();
|
||||
for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) {
|
||||
Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
|
||||
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
|
||||
if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -484,9 +408,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele);
|
||||
genotypeLogLikelihoods.getRef().add(readRefLogLikelihood);
|
||||
|
||||
for (Allele altAllele : alleleFractions.getAltAlleles()) {
|
||||
double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele);
|
||||
double adjustedReadAltLL = Math.log10(
|
||||
for (final Allele altAllele : alleleFractions.getAltAlleles()) {
|
||||
final double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele);
|
||||
final double adjustedReadAltLL = Math.log10(
|
||||
Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) +
|
||||
Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele)
|
||||
);
|
||||
|
|
@ -515,9 +439,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
final PerAlleleCollection<Integer> alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly);
|
||||
final PerAlleleCollection<Double> alleleFractions = PerAlleleCollection.createPerAltAlleleCollection();
|
||||
|
||||
int refCount = alleleCounts.getRef();
|
||||
final int refCount = alleleCounts.getRef();
|
||||
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
|
||||
int altCount = alleleCounts.getAlt(altAllele);
|
||||
final int altCount = alleleCounts.getAlt(altAllele);
|
||||
double alleleFraction = (double) altCount / (refCount + altCount);
|
||||
// weird case, but I've seen it happen in one strand cases
|
||||
if (refCount == 0 && altCount == refCount ) {
|
||||
|
|
@ -552,16 +476,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
|
||||
|
||||
final PerAlleleCollection<MutableInt> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
||||
|
||||
// initialize the allele counts to 0
|
||||
for (final Allele allele : vcAlleles) {
|
||||
alleleCounts.set(allele, new MutableInt(0));
|
||||
}
|
||||
vcAlleles.stream().forEach(a -> alleleCounts.set(a, new MutableInt(0)));
|
||||
|
||||
for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = readAlleleLikelihoodMap.getKey();
|
||||
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
|
||||
MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles);
|
||||
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles);
|
||||
|
||||
if (read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) {
|
||||
alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment();
|
||||
|
|
@ -581,18 +501,16 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
}
|
||||
}
|
||||
|
||||
private void filterPRALMForOverlappingReads(PerReadAlleleLikelihoodMap pralm, Allele ref, int location, boolean retainMismatches) {
|
||||
|
||||
Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
|
||||
|
||||
private void filterPRALMForOverlappingReads(final PerReadAlleleLikelihoodMap pralm, final Allele ref, final int location, final boolean retainMismatches) {
|
||||
final Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
|
||||
|
||||
// iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them
|
||||
Map<String, GATKSAMRecord> nameToRead = new HashMap<>();
|
||||
Set<GATKSAMRecord> readsToKeep = new HashSet<>();
|
||||
final Map<String, GATKSAMRecord> nameToRead = new HashMap<>();
|
||||
final Set<GATKSAMRecord> readsToKeep = new HashSet<>();
|
||||
|
||||
for(GATKSAMRecord rec : m.keySet()) {
|
||||
for(final GATKSAMRecord rec : m.keySet()) {
|
||||
// if we haven't seen it... just record the name and add it to the list of reads to keep
|
||||
GATKSAMRecord existing = nameToRead.get(rec.getReadName());
|
||||
final GATKSAMRecord existing = nameToRead.get(rec.getReadName());
|
||||
if (existing == null) {
|
||||
nameToRead.put(rec.getReadName(), rec);
|
||||
readsToKeep.add(rec);
|
||||
|
|
@ -604,11 +522,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
// TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!!
|
||||
if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) {
|
||||
|
||||
MostLikelyAllele existingMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing));
|
||||
Allele existingAllele = existingMLA.getMostLikelyAllele();
|
||||
final MostLikelyAllele existingMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing));
|
||||
final Allele existingAllele = existingMLA.getMostLikelyAllele();
|
||||
|
||||
MostLikelyAllele recMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec));
|
||||
Allele recAllele = recMLA.getMostLikelyAllele();
|
||||
final MostLikelyAllele recMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec));
|
||||
final Allele recAllele = recMLA.getMostLikelyAllele();
|
||||
|
||||
// if the reads disagree at this position...
|
||||
if (!existingAllele.equals(recAllele)) {
|
||||
|
|
@ -654,7 +572,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
}
|
||||
|
||||
private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) {
|
||||
Map<GATKSAMRecord, Map<Allele, Double>> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap();
|
||||
final Map<GATKSAMRecord, Map<Allele, Double>> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap();
|
||||
for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) {
|
||||
if (read.isStrandless())
|
||||
continue;
|
||||
|
|
@ -669,17 +587,4 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Move to utility class so we can use one shared with HaplotypeCallerGenotypingEngine
|
||||
private VariantContext addNonRefSymbolicAllele(final VariantContext mergedVC) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC);
|
||||
final List<Allele> originalList = mergedVC.getAlleles();
|
||||
final List<Allele> alleleList = new ArrayList<>(originalList.size() + 1);
|
||||
alleleList.addAll(mergedVC.getAlleles());
|
||||
alleleList.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
|
||||
vcb.alleles(alleleList);
|
||||
return vcb.make();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ import org.apache.commons.math.MathException;
|
|||
import org.apache.commons.math.distribution.BinomialDistribution;
|
||||
import org.apache.commons.math.distribution.BinomialDistributionImpl;
|
||||
import org.apache.commons.math3.util.Pair;
|
||||
import org.broadinstitute.gatk.utils.exceptions.GATKException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
|
@ -70,7 +71,6 @@ public class TumorPowerCalculator {
|
|||
private final double tumorLODThreshold;
|
||||
private final double contamination;
|
||||
private final boolean enableSmoothing;
|
||||
public static int numCacheHits = 0;
|
||||
|
||||
private final HashMap<PowerCacheKey, Double> cache = new HashMap<PowerCacheKey, Double>();
|
||||
|
||||
|
|
@ -134,16 +134,18 @@ public class TumorPowerCalculator {
|
|||
* @throws MathException
|
||||
*
|
||||
*/
|
||||
public double cachedPowerCalculation(final int numReads, final double alleleFraction) throws MathException {
|
||||
public double cachedPowerCalculation(final int numReads, final double alleleFraction) {
|
||||
PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction);
|
||||
// we first look up if power for given number of read and allele fraction has already been computed and stored in the cache.
|
||||
// if not we compute it and store it in teh cache.
|
||||
Double power = cache.get(key);
|
||||
if (power == null) {
|
||||
power = calculatePower(numReads, alleleFraction);
|
||||
try {
|
||||
power = calculatePower(numReads, alleleFraction);
|
||||
} catch (final Exception ex) {
|
||||
throw new GATKException("Power calculation failed", ex);
|
||||
}
|
||||
cache.put(key, power);
|
||||
} else {
|
||||
numCacheHits++;
|
||||
}
|
||||
return power;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue