Merge pull request #1453 from broadinstitute/db_mutect2
Lots of small improvements to Mutect2 code
This commit is contained in:
commit
4aede99697
|
|
@ -51,12 +51,41 @@
|
||||||
|
|
||||||
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
|
package org.broadinstitute.gatk.tools.walkers.cancer.m2;
|
||||||
|
|
||||||
|
import htsjdk.variant.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
|
||||||
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection;
|
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection;
|
||||||
import org.broadinstitute.gatk.utils.commandline.Advanced;
|
import org.broadinstitute.gatk.utils.commandline.*;
|
||||||
import org.broadinstitute.gatk.utils.commandline.Argument;
|
|
||||||
import org.broadinstitute.gatk.utils.commandline.Hidden;
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection {
|
public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection {
|
||||||
|
|
||||||
|
/***************************************/
|
||||||
|
// Reference Metadata inputs
|
||||||
|
/***************************************/
|
||||||
|
/**
|
||||||
|
* MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant
|
||||||
|
* in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal
|
||||||
|
* sample to prove the variant is not present in germline.
|
||||||
|
*/
|
||||||
|
@Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false)
|
||||||
|
public List<RodBinding<VariantContext>> cosmicRod = Collections.emptyList();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants.
|
||||||
|
*/
|
||||||
|
@Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false)
|
||||||
|
public List<RodBinding<VariantContext>> normalPanelRod = Collections.emptyList();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||||
|
* dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline.
|
||||||
|
*/
|
||||||
|
@ArgumentCollection
|
||||||
|
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||||
|
|
||||||
@Advanced
|
@Advanced
|
||||||
@Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false)
|
@Argument(fullName="m2debug", shortName="m2debug", doc="Print out very verbose M2 debug information", required = false)
|
||||||
public boolean M2_DEBUG = false;
|
public boolean M2_DEBUG = false;
|
||||||
|
|
|
||||||
|
|
@ -201,7 +201,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
// the genotyping engine
|
// the genotyping engine
|
||||||
protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
|
protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
|
||||||
|
|
||||||
|
|
||||||
private byte MIN_TAIL_QUALITY;
|
private byte MIN_TAIL_QUALITY;
|
||||||
private double log10GlobalReadMismappingRate;
|
private double log10GlobalReadMismappingRate;
|
||||||
|
|
||||||
|
|
@ -214,9 +213,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
@ArgumentCollection
|
@ArgumentCollection
|
||||||
protected LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection();
|
protected LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection();
|
||||||
|
|
||||||
|
|
||||||
@Argument(fullName = "debug_read_name", required = false, doc="trace this read name through the calling process")
|
@Argument(fullName = "debug_read_name", required = false, doc="trace this read name through the calling process")
|
||||||
public String DEBUG_READ_NAME = null;
|
protected String DEBUG_READ_NAME = null;
|
||||||
|
|
||||||
@Hidden
|
@Hidden
|
||||||
@Advanced
|
@Advanced
|
||||||
|
|
@ -224,24 +222,81 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
final public int MQthreshold = 20;
|
final public int MQthreshold = 20;
|
||||||
|
|
||||||
|
|
||||||
/***************************************/
|
public RodBinding<VariantContext> getDbsnpRodBinding() { return MTAC.dbsnp.dbsnp; }
|
||||||
// Reference Metadata inputs
|
|
||||||
/***************************************/
|
|
||||||
/**
|
|
||||||
* MuTect2 has the ability to use COSMIC data in conjunction with dbSNP to adjust the threshold for evidence of a variant
|
|
||||||
* in the normal. If a variant is present in dbSNP, but not in COSMIC, then more evidence is required from the normal
|
|
||||||
* sample to prove the variant is not present in germline.
|
|
||||||
*/
|
|
||||||
@Input(fullName="cosmic", shortName = "cosmic", doc="VCF file of COSMIC sites", required=false)
|
|
||||||
public List<RodBinding<VariantContext>> cosmicRod = Collections.emptyList();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A panel of normals can be a useful (optional) input to help filter out commonly seen sequencing noise that may appear as low allele-fraction somatic variants.
|
|
||||||
*/
|
|
||||||
@Input(fullName="normal_panel", shortName = "PON", doc="VCF file of sites observed in normal", required=false)
|
|
||||||
public List<RodBinding<VariantContext>> normalPanelRod = Collections.emptyList();
|
|
||||||
|
|
||||||
private HaplotypeBAMWriter haplotypeBAMWriter;
|
private HaplotypeBAMWriter haplotypeBAMWriter;
|
||||||
|
/**
|
||||||
|
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
|
||||||
|
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
|
||||||
|
* Records that are filtered in the comp track will be ignored.
|
||||||
|
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||||
|
*/
|
||||||
|
@Advanced
|
||||||
|
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||||
|
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||||
|
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||||
|
*/
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||||
|
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"}));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||||
|
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||||
|
*/
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||||
|
protected List<String> annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"}));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||||
|
protected String[] annotationClassesToUse = { };
|
||||||
|
|
||||||
|
@Output(doc="File to which variants should be written")
|
||||||
|
protected VariantContextWriter vcfWriter = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Active region trimmer reference.
|
||||||
|
*/
|
||||||
|
@ArgumentCollection
|
||||||
|
protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer();
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||||
|
protected String keepRG = null;
|
||||||
|
|
||||||
|
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||||
|
public byte MIN_BASE_QUALTY_SCORE = 10;
|
||||||
|
|
||||||
|
public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||||
|
protected boolean errorCorrectReads = false;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
|
||||||
|
protected boolean captureAssemblyFailureBAM = false;
|
||||||
|
|
||||||
|
@Advanced
|
||||||
|
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
|
||||||
|
protected boolean dontUseSoftClippedBases = false;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
||||||
|
protected boolean justDetermineActiveRegions = false;
|
||||||
|
|
||||||
|
// reference base padding size
|
||||||
|
private static final int REFERENCE_PADDING = 500;
|
||||||
|
|
||||||
|
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
||||||
|
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
||||||
|
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
@ -250,15 +305,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
samplesList = new IndexedSampleList(new ArrayList<>(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader())));
|
samplesList = new IndexedSampleList(new ArrayList<>(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader())));
|
||||||
|
|
||||||
// MUTECT: check that we have at least one tumor bam
|
// MUTECT: check that we have at least one tumor bam
|
||||||
for(SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) {
|
for(final SAMReaderID id : getToolkit().getReadsDataSource().getReaderIDs()) {
|
||||||
if (id.getTags().getPositionalTags().size() == 0) {
|
if (id.getTags().getPositionalTags().size() == 0) {
|
||||||
throw new RuntimeException("BAMs must be tagged as either 'tumor' or 'normal'");
|
throw new RuntimeException("BAMs must be tagged as either 'tumor' or 'normal'");
|
||||||
}
|
}
|
||||||
|
|
||||||
// only supports single-sample BAMs (ie first read group is representative)
|
// only supports single-sample BAMs (ie first read group is representative)
|
||||||
String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample();
|
final String bamSampleName = getToolkit().getReadsDataSource().getHeader(id).getReadGroups().get(0).getSample();
|
||||||
|
|
||||||
for(String tag : id.getTags().getPositionalTags()) {
|
for(final String tag : id.getTags().getPositionalTags()) {
|
||||||
if (BAM_TAG_TUMOR.equalsIgnoreCase(tag)) {
|
if (BAM_TAG_TUMOR.equalsIgnoreCase(tag)) {
|
||||||
tumorSAMReaderIDs.add(id);
|
tumorSAMReaderIDs.add(id);
|
||||||
if (tumorSampleName == null) {
|
if (tumorSampleName == null) {
|
||||||
|
|
@ -330,9 +385,9 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes();
|
final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes();
|
||||||
|
|
||||||
final GenomeAnalysisEngine toolkit = getToolkit();
|
final GenomeAnalysisEngine toolkit = getToolkit();
|
||||||
final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser();
|
|
||||||
|
|
||||||
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, genomeLocParser, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), MTAC, logger), !doNotRunPhysicalPhasing, MTAC);
|
genotypingEngine = new SomaticGenotypingEngine( MTAC, samplesList, toolkit.getGenomeLocParser(), !doNotRunPhysicalPhasing, MTAC,
|
||||||
|
tumorSampleName, normalSampleName, DEBUG_READ_NAME);
|
||||||
|
|
||||||
genotypingEngine.setCrossHaplotypeEventMerger(variantMerger);
|
genotypingEngine.setCrossHaplotypeEventMerger(variantMerger);
|
||||||
genotypingEngine.setAnnotationEngine(annotationEngine);
|
genotypingEngine.setAnnotationEngine(annotationEngine);
|
||||||
|
|
@ -353,7 +408,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
trimmer.snpPadding = 50;
|
trimmer.snpPadding = 50;
|
||||||
|
|
||||||
samplesList = toolkit.getReadSampleList();
|
samplesList = toolkit.getReadSampleList();
|
||||||
Set<String> sampleSet = SampleListUtils.asSet(samplesList);
|
final Set<String> sampleSet = SampleListUtils.asSet(samplesList);
|
||||||
|
|
||||||
if( MTAC.CONTAMINATION_FRACTION_FILE != null )
|
if( MTAC.CONTAMINATION_FRACTION_FILE != null )
|
||||||
MTAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(MTAC.CONTAMINATION_FRACTION_FILE, MTAC.CONTAMINATION_FRACTION, sampleSet, logger));
|
MTAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(MTAC.CONTAMINATION_FRACTION_FILE, MTAC.CONTAMINATION_FRACTION, sampleSet, logger));
|
||||||
|
|
@ -367,7 +422,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||||
|
|
||||||
Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
final Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
||||||
|
|
||||||
// all annotation fields from VariantAnnotatorEngine
|
// all annotation fields from VariantAnnotatorEngine
|
||||||
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||||
|
|
@ -383,7 +438,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
headerInfo.addAll(getM2HeaderLines());
|
headerInfo.addAll(getM2HeaderLines());
|
||||||
headerInfo.addAll(getSampleHeaderLines());
|
headerInfo.addAll(getSampleHeaderLines());
|
||||||
|
|
||||||
List<String> outputSampleNames = getOutputSampleNames();
|
final List<String> outputSampleNames = getOutputSampleNames();
|
||||||
|
|
||||||
vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames));
|
vcfWriter.writeHeader(new VCFHeader(headerInfo, outputSampleNames));
|
||||||
|
|
||||||
|
|
@ -391,7 +446,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<VCFHeaderLine> getM2HeaderLines(){
|
private Set<VCFHeaderLine> getM2HeaderLines(){
|
||||||
Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
final Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
||||||
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NORMAL_LOD_KEY));
|
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NORMAL_LOD_KEY));
|
||||||
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_LOD_KEY));
|
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.TUMOR_LOD_KEY));
|
||||||
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY));
|
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY));
|
||||||
|
|
@ -430,21 +485,21 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<VCFHeaderLine> getSampleHeaderLines(){
|
private Set<VCFHeaderLine> getSampleHeaderLines(){
|
||||||
Set<VCFHeaderLine> sampleLines = new HashSet<>();
|
final Set<VCFHeaderLine> sampleLines = new HashSet<>();
|
||||||
if (printTCGAsampleHeader) {
|
if (printTCGAsampleHeader) {
|
||||||
//NOTE: This will only list the first bam file for each tumor/normal sample if there is more than one
|
//NOTE: This will only list the first bam file for each tumor/normal sample if there is more than one
|
||||||
Map<String, String> normalSampleHeaderAttributes = new HashMap<>();
|
final Map<String, String> normalSampleHeaderAttributes = new HashMap<>();
|
||||||
normalSampleHeaderAttributes.put("ID", "NORMAL");
|
normalSampleHeaderAttributes.put("ID", "NORMAL");
|
||||||
normalSampleHeaderAttributes.put("SampleName", normalSampleName);
|
normalSampleHeaderAttributes.put("SampleName", normalSampleName);
|
||||||
if (normalSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
|
if (normalSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
|
||||||
normalSampleHeaderAttributes.put("File", normalSAMReaderIDs.iterator().next().getSamFilePath());
|
normalSampleHeaderAttributes.put("File", normalSAMReaderIDs.iterator().next().getSamFilePath());
|
||||||
VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes);
|
final VCFSimpleHeaderLine normalSampleHeader = new VCFSimpleHeaderLine("SAMPLE", normalSampleHeaderAttributes);
|
||||||
Map<String, String> tumorSampleHeaderAttributes = new HashMap<>();
|
final Map<String, String> tumorSampleHeaderAttributes = new HashMap<>();
|
||||||
tumorSampleHeaderAttributes.put("ID", "TUMOR");
|
tumorSampleHeaderAttributes.put("ID", "TUMOR");
|
||||||
tumorSampleHeaderAttributes.put("SampleName", tumorSampleName);
|
tumorSampleHeaderAttributes.put("SampleName", tumorSampleName);
|
||||||
if (tumorSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
|
if (tumorSAMReaderIDs.iterator().hasNext() && !getToolkit().getArguments().disableCommandLineInVCF)
|
||||||
tumorSampleHeaderAttributes.put("File", tumorSAMReaderIDs.iterator().next().getSamFilePath());
|
tumorSampleHeaderAttributes.put("File", tumorSAMReaderIDs.iterator().next().getSamFilePath());
|
||||||
VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes);
|
final VCFSimpleHeaderLine tumorSampleHeader = new VCFSimpleHeaderLine("SAMPLE", tumorSampleHeaderAttributes);
|
||||||
|
|
||||||
sampleLines.add(normalSampleHeader);
|
sampleLines.add(normalSampleHeader);
|
||||||
sampleLines.add(tumorSampleHeader);
|
sampleLines.add(tumorSampleHeader);
|
||||||
|
|
@ -455,7 +510,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
private List<String> getOutputSampleNames(){
|
private List<String> getOutputSampleNames(){
|
||||||
if (printTCGAsampleHeader) {
|
if (printTCGAsampleHeader) {
|
||||||
//Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
|
//Already checked for exactly 1 tumor and 1 normal in printTCGAsampleHeader assignment in initialize()
|
||||||
List<String> sampleNamePlaceholders = new ArrayList<>(2);
|
final List<String> sampleNamePlaceholders = new ArrayList<>(2);
|
||||||
sampleNamePlaceholders.add("TUMOR");
|
sampleNamePlaceholders.add("TUMOR");
|
||||||
sampleNamePlaceholders.add("NORMAL");
|
sampleNamePlaceholders.add("NORMAL");
|
||||||
return sampleNamePlaceholders;
|
return sampleNamePlaceholders;
|
||||||
|
|
@ -466,14 +521,14 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||||
if( context == null || context.getBasePileup().isEmpty() )
|
if( context == null || context.getBasePileup().isEmpty() )
|
||||||
// if we don't have any data, just abort early
|
// if we don't have any data, just abort early
|
||||||
return new ActivityProfileState(ref.getLocus(), 0.0);
|
return new ActivityProfileState(ref.getLocus(), 0.0);
|
||||||
|
|
||||||
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
|
final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
|
||||||
AlignmentContext tumorContext = splitContexts.get(tumorSampleName);
|
final AlignmentContext tumorContext = splitContexts.get(tumorSampleName);
|
||||||
AlignmentContext normalContext = splitContexts.get(normalSampleName);
|
final AlignmentContext normalContext = splitContexts.get(normalSampleName);
|
||||||
|
|
||||||
// if there are no tumor reads... there is no activity!
|
// if there are no tumor reads... there is no activity!
|
||||||
if (tumorContext == null) {
|
if (tumorContext == null) {
|
||||||
|
|
@ -481,7 +536,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
// KCIBUL -- this method was inlined and modified from ReferenceConfidenceModel
|
// KCIBUL -- this method was inlined and modified from ReferenceConfidenceModel
|
||||||
ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold);
|
final ReadBackedPileup tumorPileup = tumorContext.getBasePileup().getMappingFilteredPileup(MQthreshold);
|
||||||
final double[] tumorGLs = calcGenotypeLikelihoodsOfRefVsAny(tumorPileup, ref.getBase(), MIN_BASE_QUALTY_SCORE);
|
final double[] tumorGLs = calcGenotypeLikelihoodsOfRefVsAny(tumorPileup, ref.getBase(), MIN_BASE_QUALTY_SCORE);
|
||||||
final double tumorLod = tumorGLs[1] - tumorGLs[0];
|
final double tumorLod = tumorGLs[1] - tumorGLs[0];
|
||||||
|
|
||||||
|
|
@ -495,7 +550,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
// in any case, we have to handle the case where there is no normal (and thus no normal context) which is
|
// in any case, we have to handle the case where there is no normal (and thus no normal context) which is
|
||||||
// different than having a normal but having no reads (where we should not enter the active region)
|
// different than having a normal but having no reads (where we should not enter the active region)
|
||||||
if (normalSampleName != null && normalContext != null) {
|
if (normalSampleName != null && normalContext != null) {
|
||||||
int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE);
|
final int nonRefInNormal = getCountOfNonRefEvents(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE);
|
||||||
|
|
||||||
final double[] normalGLs = calcGenotypeLikelihoodsOfRefVsAny(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, 0.5f);
|
final double[] normalGLs = calcGenotypeLikelihoodsOfRefVsAny(normalContext.getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, 0.5f);
|
||||||
final double normalLod = normalGLs[0] - normalGLs[1];
|
final double normalLod = normalGLs[0] - normalGLs[1];
|
||||||
|
|
@ -528,13 +583,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
|
|
||||||
// No reads here so nothing to do!
|
// No reads here so nothing to do!
|
||||||
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
|
if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); }
|
||||||
|
|
||||||
logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region");
|
logReadInfo(DEBUG_READ_NAME, originalActiveRegion.getReads(), "Present in original active region");
|
||||||
|
|
||||||
// create the assembly using just high quality reads (Q20 or higher). We want to use lower
|
// create the assembly using just high quality reads (Q20 or higher). We want to use lower
|
||||||
// quality reads in the PairHMM (and especially in the normal) later, so we can't use a ReadFilter
|
// quality reads in the PairHMM (and especially in the normal) later, so we can't use a ReadFilter
|
||||||
ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension());
|
final ActiveRegion assemblyActiveRegion = new ActiveRegion(originalActiveRegion.getLocation(), originalActiveRegion.getSupportingStates(),originalActiveRegion.isActive(), getToolkit().getGenomeLocParser(), originalActiveRegion.getExtension());
|
||||||
for (GATKSAMRecord rec : originalActiveRegion.getReads()) {
|
for (final GATKSAMRecord rec : originalActiveRegion.getReads()) {
|
||||||
if (rec.getMappingQuality() >= MQthreshold ) {
|
if (rec.getMappingQuality() >= MQthreshold ) {
|
||||||
assemblyActiveRegion.add(rec);
|
assemblyActiveRegion.add(rec);
|
||||||
}
|
}
|
||||||
|
|
@ -558,18 +612,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
// Stop the trimming madness!!!
|
// Stop the trimming madness!!!
|
||||||
if (!trimmingResult.isVariationPresent())
|
if (!trimmingResult.isVariationPresent())
|
||||||
return referenceModelForNoVariation(originalActiveRegion,false);
|
return referenceModelForNoVariation(originalActiveRegion,false);
|
||||||
|
|
||||||
logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result");
|
logReadInfo(DEBUG_READ_NAME, trimmingResult.getCallableRegion().getReads(), "Present in trimming result");
|
||||||
|
|
||||||
final AssemblyResultSet assemblyResult =
|
final AssemblyResultSet assemblyResult =
|
||||||
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
|
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
|
||||||
|
|
||||||
// final AssemblyResultSet assemblyResult = untrimmedAssemblyResult;
|
|
||||||
|
|
||||||
// after talking to Ryan -- they grab the reads out of the assembly (and trim then) to pass into the PairHMM
|
|
||||||
// because at one point they were trying error correcting of the reads based on the haplotypes.. but that is not
|
|
||||||
// working out, so it's safe for us just to take the reads
|
|
||||||
//
|
|
||||||
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
|
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
|
||||||
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping");
|
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping");
|
||||||
|
|
||||||
|
|
@ -577,10 +624,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads());
|
haplotypeBAMWriter.addDroppedReadsFromDelta(DroppedReadsTracker.Reason.TRIMMMED, originalActiveRegion.getReads(), regionForGenotyping.getReads());
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// final ActiveRegion regionForGenotyping = trimmingResult.getCallableRegion();
|
|
||||||
|
|
||||||
// final ActiveRegion regionForGenotyping = originalActiveRegion;
|
|
||||||
|
|
||||||
// filter out reads from genotyping which fail mapping quality based criteria
|
// filter out reads from genotyping which fail mapping quality based criteria
|
||||||
//TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method
|
//TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method
|
||||||
|
|
@ -594,7 +637,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads);
|
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample(filteredReads);
|
||||||
|
|
||||||
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads");
|
logReadInfo(DEBUG_READ_NAME, regionForGenotyping.getReads(), "Present in region for genotyping after filtering reads");
|
||||||
|
|
||||||
// abort early if something is out of the acceptable range
|
// abort early if something is out of the acceptable range
|
||||||
|
|
@ -612,17 +654,17 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
|
|
||||||
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
|
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
|
||||||
final Map<String,List<GATKSAMRecord>> reads = splitReadsBySample( regionForGenotyping.getReads() );
|
final Map<String,List<GATKSAMRecord>> reads = splitReadsBySample( regionForGenotyping.getReads() );
|
||||||
for (List<GATKSAMRecord> rec : reads.values()) {
|
for (final List<GATKSAMRecord> rec : reads.values()) {
|
||||||
logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample");
|
logReadInfo(DEBUG_READ_NAME, rec, "Present after splitting assemblyResult by sample");
|
||||||
}
|
}
|
||||||
|
|
||||||
final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>();
|
final HashMap<String, Integer> ARreads_origNormalMQ = new HashMap<>();
|
||||||
for (GATKSAMRecord read : regionForGenotyping.getReads()) {
|
for (final GATKSAMRecord read : regionForGenotyping.getReads()) {
|
||||||
ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality());
|
ARreads_origNormalMQ.put(read.getReadName(), read.getMappingQuality());
|
||||||
}
|
}
|
||||||
|
|
||||||
// modify MAPQ scores in normal to be high so that we don't do any base quality score capping
|
// modify MAPQ scores in normal to be high so that we don't do any base quality score capping
|
||||||
for(GATKSAMRecord rec : regionForGenotyping.getReads()) {
|
for(final GATKSAMRecord rec : regionForGenotyping.getReads()) {
|
||||||
if (isReadFromNormal(rec)) {
|
if (isReadFromNormal(rec)) {
|
||||||
rec.setMappingQuality(60);
|
rec.setMappingQuality(60);
|
||||||
}
|
}
|
||||||
|
|
@ -647,8 +689,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
readLikelihoods.changeReads(readRealignments);
|
readLikelihoods.changeReads(readRealignments);
|
||||||
|
for (final GATKSAMRecord rec : readRealignments.keySet()) {
|
||||||
for (GATKSAMRecord rec : readRealignments.keySet()) {
|
|
||||||
logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods");
|
logReadInfo(DEBUG_READ_NAME, rec, "Present after computing read likelihoods");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -666,15 +707,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
assemblyResult.getFullReferenceWithPadding(),
|
assemblyResult.getFullReferenceWithPadding(),
|
||||||
assemblyResult.getPaddedReferenceLoc(),
|
assemblyResult.getPaddedReferenceLoc(),
|
||||||
regionForGenotyping.getLocation(),
|
regionForGenotyping.getLocation(),
|
||||||
getToolkit().getGenomeLocParser(),
|
|
||||||
metaDataTracker,
|
metaDataTracker,
|
||||||
givenAlleles, false ,
|
givenAlleles);
|
||||||
tumorSampleName,
|
|
||||||
normalSampleName,
|
|
||||||
dbsnp.dbsnp,
|
|
||||||
cosmicRod,
|
|
||||||
DEBUG_READ_NAME
|
|
||||||
);
|
|
||||||
|
|
||||||
if ( MTAC.bamWriter != null ) {
|
if ( MTAC.bamWriter != null ) {
|
||||||
final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes());
|
final Set<Haplotype> calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes());
|
||||||
|
|
@ -695,16 +729,16 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
if( MTAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
||||||
|
|
||||||
|
|
||||||
List<VariantContext> annotatedCalls = new ArrayList<>();
|
final List<VariantContext> annotatedCalls = new ArrayList<>();
|
||||||
int eventCount = calledHaplotypes.getCalls().size();
|
final int eventCount = calledHaplotypes.getCalls().size();
|
||||||
Integer minEventDistance = null;
|
Integer minEventDistance = null;
|
||||||
Integer maxEventDistance = null;
|
Integer maxEventDistance = null;
|
||||||
Integer lastPosition = null;
|
Integer lastPosition = null;
|
||||||
for (VariantContext vc : calledHaplotypes.getCalls()) {
|
for (final VariantContext vc : calledHaplotypes.getCalls()) {
|
||||||
if (lastPosition == null) {
|
if (lastPosition == null) {
|
||||||
lastPosition = vc.getStart();
|
lastPosition = vc.getStart();
|
||||||
} else {
|
} else {
|
||||||
int dist = Math.abs(vc.getStart() - lastPosition);
|
final int dist = Math.abs(vc.getStart() - lastPosition);
|
||||||
if (maxEventDistance == null || dist > maxEventDistance) {
|
if (maxEventDistance == null || dist > maxEventDistance) {
|
||||||
maxEventDistance = dist;
|
maxEventDistance = dist;
|
||||||
}
|
}
|
||||||
|
|
@ -713,23 +747,23 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Map<String, Object> eventDistanceAttributes = new HashMap<>();
|
final Map<String, Object> eventDistanceAttributes = new HashMap<>();
|
||||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount);
|
||||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance);
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, minEventDistance);
|
||||||
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance);
|
eventDistanceAttributes.put(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, maxEventDistance);
|
||||||
|
|
||||||
|
|
||||||
// can we do this with the Annotation classes instead?
|
// can we do this with the Annotation classes instead?
|
||||||
for (VariantContext originalVC : calledHaplotypes.getCalls()) {
|
for (final VariantContext originalVC : calledHaplotypes.getCalls()) {
|
||||||
VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
|
final VariantContextBuilder vcb = new VariantContextBuilder(originalVC);
|
||||||
|
|
||||||
Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
|
final Map<String, Object> attributes = new HashMap<>(originalVC.getAttributes());
|
||||||
attributes.putAll(eventDistanceAttributes);
|
attributes.putAll(eventDistanceAttributes);
|
||||||
vcb.attributes(attributes);
|
vcb.attributes(attributes);
|
||||||
|
|
||||||
Set<String> filters = new HashSet<>(originalVC.getFilters());
|
final Set<String> filters = new HashSet<>(originalVC.getFilters());
|
||||||
|
|
||||||
double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
|
final double tumorLod = originalVC.getAttributeAsDouble(GATKVCFConstants.TUMOR_LOD_KEY, -1);
|
||||||
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
|
if (tumorLod < MTAC.TUMOR_LOD_THRESHOLD) {
|
||||||
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
|
filters.add(GATKVCFConstants.TUMOR_LOD_FILTER_NAME);
|
||||||
}
|
}
|
||||||
|
|
@ -739,22 +773,12 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
|
filters.addAll(calculateFilters(metaDataTracker, originalVC, eventDistanceAttributes));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (filters.size() > 0) {
|
vcb.filters(filters.isEmpty() ? VariantContext.PASSES_FILTERS : filters);
|
||||||
vcb.filters(filters);
|
|
||||||
} else {
|
|
||||||
vcb.passFilters();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (printTCGAsampleHeader) {
|
if (printTCGAsampleHeader) {
|
||||||
GenotypesContext genotypesWithBamSampleNames = originalVC.getGenotypes();
|
final Genotype tumorGenotype = new GenotypeBuilder(originalVC.getGenotype(tumorSampleName)).name("TUMOR").make();
|
||||||
List<Genotype> renamedGenotypes = new ArrayList<>();
|
final Genotype normalGenotype = new GenotypeBuilder(originalVC.getGenotype(normalSampleName)).name("NORMAL").make();
|
||||||
GenotypeBuilder GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(tumorSampleName));
|
vcb.genotypes(Arrays.asList(tumorGenotype, normalGenotype));
|
||||||
GTbuilder.name("TUMOR");
|
|
||||||
renamedGenotypes.add(GTbuilder.make());
|
|
||||||
GTbuilder = new GenotypeBuilder(genotypesWithBamSampleNames.get(normalSampleName));
|
|
||||||
GTbuilder.name("NORMAL");
|
|
||||||
renamedGenotypes.add(GTbuilder.make());
|
|
||||||
vcb.genotypes(renamedGenotypes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
annotatedCalls.add(vcb.make());
|
annotatedCalls.add(vcb.make());
|
||||||
|
|
@ -765,15 +789,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
return annotatedCalls;
|
return annotatedCalls;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<String> calculateFilters(RefMetaDataTracker metaDataTracker, VariantContext vc, Map<String, Object> eventDistanceAttributes) {
|
private Set<String> calculateFilters(final RefMetaDataTracker metaDataTracker, final VariantContext vc, final Map<String, Object> eventDistanceAttributes) {
|
||||||
Set<String> filters = new HashSet<>();
|
final Set<String> filters = new HashSet<>();
|
||||||
|
|
||||||
Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
|
final Integer eventCount = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
|
||||||
Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY);
|
final Integer maxEventDistance = (Integer) eventDistanceAttributes.get(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY);
|
||||||
|
|
||||||
Collection<VariantContext> panelOfNormalsVC = metaDataTracker.getValues(normalPanelRod,
|
final Collection<VariantContext> panelOfNormalsVC = metaDataTracker.getValues(MTAC.normalPanelRod,
|
||||||
getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart()));
|
getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart()));
|
||||||
VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next();
|
final VariantContext ponVc = panelOfNormalsVC.isEmpty()?null:panelOfNormalsVC.iterator().next();
|
||||||
|
|
||||||
if (ponVc != null) {
|
if (ponVc != null) {
|
||||||
filters.add(GATKVCFConstants.PON_FILTER_NAME);
|
filters.add(GATKVCFConstants.PON_FILTER_NAME);
|
||||||
|
|
@ -788,13 +812,13 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
double normalF = 0;
|
double normalF = 0;
|
||||||
int normalAltQualityScoreSum = 0;
|
int normalAltQualityScoreSum = 0;
|
||||||
if (hasNormal()) {
|
if (hasNormal()) {
|
||||||
Genotype normalGenotype = vc.getGenotype(normalSampleName);
|
final Genotype normalGenotype = vc.getGenotype(normalSampleName);
|
||||||
|
|
||||||
// NOTE: how do we get the non-ref depth here?
|
// NOTE: how do we get the non-ref depth here?
|
||||||
normalAltCounts = normalGenotype.getAD()[1];
|
normalAltCounts = normalGenotype.getAD()[1];
|
||||||
normalF = (Double) normalGenotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY);
|
normalF = (Double) normalGenotype.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY);
|
||||||
|
|
||||||
Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY);
|
final Object qss = normalGenotype.getExtendedAttribute(GATKVCFConstants.QUALITY_SCORE_SUM_KEY);
|
||||||
if (qss != null) {
|
if (qss != null) {
|
||||||
normalAltQualityScoreSum = (Integer) ((Object[]) qss)[1];
|
normalAltQualityScoreSum = (Integer) ((Object[]) qss)[1];
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -819,11 +843,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
// such as ACTACTACT -> ACTACT, are overwhelmingly false positives so we
|
// such as ACTACTACT -> ACTACT, are overwhelmingly false positives so we
|
||||||
// hard filter them out by default
|
// hard filter them out by default
|
||||||
if (vc.isIndel()) {
|
if (vc.isIndel()) {
|
||||||
ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY);
|
final ArrayList rpa = (ArrayList) vc.getAttribute(GATKVCFConstants.REPEATS_PER_ALLELE_KEY);
|
||||||
String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, "");
|
final String ru = vc.getAttributeAsString(GATKVCFConstants.REPEAT_UNIT_KEY, "");
|
||||||
if (rpa != null && rpa.size() > 1 && ru.length() > 1) {
|
if (rpa != null && rpa.size() > 1 && ru.length() > 1) {
|
||||||
int refCount = (Integer) rpa.get(0);
|
final int refCount = (Integer) rpa.get(0);
|
||||||
int altCount = (Integer) rpa.get(1);
|
final int altCount = (Integer) rpa.get(1);
|
||||||
|
|
||||||
if (refCount - altCount == 1) {
|
if (refCount - altCount == 1) {
|
||||||
filters.add(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME);
|
filters.add(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME);
|
||||||
|
|
@ -840,10 +864,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
|
|
||||||
// clustered read position filter
|
// clustered read position filter
|
||||||
if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
|
if (MTAC.ENABLE_CLUSTERED_READ_POSITION_FILTER){
|
||||||
Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
|
final Double tumorFwdPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_LEFT_OFFSET_KEY);
|
||||||
Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY);
|
final Double tumorRevPosMedian = (Double) vc.getAttribute(GATKVCFConstants.MEDIAN_RIGHT_OFFSET_KEY);
|
||||||
Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY);
|
final Double tumorFwdPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_LEFT_OFFSET_KEY);
|
||||||
Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
|
final Double tumorRevPosMAD = (Double) vc.getAttribute(GATKVCFConstants.MAD_MEDIAN_RIGHT_OFFSET_KEY);
|
||||||
//If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter
|
//If the variant is near the read end (median threshold) and the positions are very similar (MAD threshold) then filter
|
||||||
if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) ||
|
if ( (tumorFwdPosMedian != null && tumorFwdPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorFwdPosMAD != null && tumorFwdPosMAD <= MTAC.PIR_MAD_THRESHOLD) ||
|
||||||
(tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD))
|
(tumorRevPosMedian != null && tumorRevPosMedian <= MTAC.PIR_MEDIAN_THRESHOLD && tumorRevPosMAD != null && tumorRevPosMAD <= MTAC.PIR_MAD_THRESHOLD))
|
||||||
|
|
@ -866,14 +890,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
*/
|
*/
|
||||||
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final double f) {
|
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final double f) {
|
||||||
final double[] genotypeLikelihoods = new double[2];
|
final double[] genotypeLikelihoods = new double[2];
|
||||||
int AA = 0, AB=1;
|
final int AA = 0;
|
||||||
|
final int AB=1;
|
||||||
for( final PileupElement p : pileup ) {
|
for( final PileupElement p : pileup ) {
|
||||||
final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual());
|
final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual());
|
||||||
if( p.isDeletion() || qual > minBaseQual ) {
|
if( p.isDeletion() || qual > minBaseQual ) {
|
||||||
|
|
||||||
// TODO: why not use base qualities here?
|
// TODO: why not use base qualities here?
|
||||||
//double pobs = QualityUtils.qualToErrorProbLog10(qual);
|
//double pobs = QualityUtils.qualToErrorProbLog10(qual);
|
||||||
double pobs = 1.0d - pow(10, (30 / -10.0));
|
final double pobs = 1.0d - pow(10, (30 / -10.0));
|
||||||
if( isNonRef(refBase, p)) {
|
if( isNonRef(refBase, p)) {
|
||||||
genotypeLikelihoods[AB] += Math.log10(f*pobs + (1-f)*pobs/3.0d);
|
genotypeLikelihoods[AB] += Math.log10(f*pobs + (1-f)*pobs/3.0d);
|
||||||
genotypeLikelihoods[AA] += Math.log10((1-pobs)/3);
|
genotypeLikelihoods[AA] += Math.log10((1-pobs)/3);
|
||||||
|
|
@ -905,7 +930,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) {
|
protected double[] calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual) {
|
||||||
double f = calculateF(pileup, refBase, minBaseQual);
|
final double f = calculateF(pileup, refBase, minBaseQual);
|
||||||
return calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, minBaseQual, f);
|
return calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, minBaseQual, f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -923,11 +948,10 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double f = (double) altCount / ((double) refCount + (double) altCount);
|
return (double) altCount / (refCount + altCount);
|
||||||
return f;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isNonRef(byte refBase, PileupElement p) {
|
private boolean isNonRef(final byte refBase, final PileupElement p) {
|
||||||
return p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip();
|
return p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -960,8 +984,8 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
return readsToRemove;
|
return readsToRemove;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static GATKSAMRecord findReadByName(Collection<GATKSAMRecord> reads, String name) {
|
private static GATKSAMRecord findReadByName(final Collection<GATKSAMRecord> reads, final String name) {
|
||||||
for(GATKSAMRecord read : reads) {
|
for(final GATKSAMRecord read : reads) {
|
||||||
if (name.equals(read.getReadName())) return read;
|
if (name.equals(read.getReadName())) return read;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
|
|
@ -1022,7 +1046,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Integer reduce(List<VariantContext> callsInRegion, Integer numCalledRegions) {
|
public Integer reduce(final List<VariantContext> callsInRegion, final Integer numCalledRegions) {
|
||||||
for( final VariantContext call : callsInRegion ) {
|
for( final VariantContext call : callsInRegion ) {
|
||||||
vcfWriter.add( call );
|
vcfWriter.add( call );
|
||||||
}
|
}
|
||||||
|
|
@ -1030,7 +1054,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onTraversalDone(Integer result) {
|
public void onTraversalDone(final Integer result) {
|
||||||
// if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
|
// if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
|
||||||
// referenceConfidenceModel.close();
|
// referenceConfidenceModel.close();
|
||||||
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed
|
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed
|
||||||
|
|
@ -1045,114 +1069,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
|
||||||
public boolean alwaysAppendDbsnpId() { return false; }
|
public boolean alwaysAppendDbsnpId() { return false; }
|
||||||
|
|
||||||
/**
|
|
||||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
|
||||||
* dbSNP overlap is only used to require more evidence of absence in the normal if the variant in question has been seen before in germline.
|
|
||||||
*/
|
|
||||||
@ArgumentCollection
|
|
||||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
|
||||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If a call overlaps with a record from the provided comp track, the INFO field will be annotated
|
|
||||||
* as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
|
|
||||||
* Records that are filtered in the comp track will be ignored.
|
|
||||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
|
||||||
*/
|
|
||||||
@Advanced
|
|
||||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
|
||||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
|
||||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
|
||||||
*/
|
|
||||||
@Advanced
|
|
||||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
|
||||||
// protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"}));
|
|
||||||
// protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBy ruSample", "TandemRepeatAnnotator",
|
|
||||||
// "RMSMappingQuality","MappingQualityRankSumTest","FisherStrand","StrandOddsRatio","ReadPosRankSumTest","QualByDepth", "Coverage"}));
|
|
||||||
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"DepthPerAlleleBySample", "BaseQualitySumPerAlleleBySample", "TandemRepeatAnnotator", "OxoGReadCounts"}));
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
|
||||||
* so annotations will be excluded even if they are explicitly included with the other options.
|
|
||||||
*/
|
|
||||||
@Advanced
|
|
||||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
|
||||||
protected List<String> annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions"}));
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
|
||||||
*/
|
|
||||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
|
||||||
//protected String[] annotationGroupsToUse = { StandardAnnotation.class.getSimpleName() };
|
|
||||||
protected String[] annotationClassesToUse = { };
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A raw, unfiltered, highly sensitive callset in VCF format.
|
|
||||||
*/
|
|
||||||
@Output(doc="File to which variants should be written")
|
|
||||||
protected VariantContextWriter vcfWriter = null;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Active region trimmer reference.
|
|
||||||
*/
|
|
||||||
@ArgumentCollection
|
|
||||||
protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer();
|
|
||||||
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
|
||||||
protected String keepRG = null;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The minimum confidence needed for a given base for it to be used in variant calling.
|
|
||||||
*/
|
|
||||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
|
||||||
public byte MIN_BASE_QUALTY_SCORE = 10;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// PAIR-HMM-Related Goodness
|
|
||||||
|
|
||||||
// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
|
|
||||||
// public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.AGGRESSIVE;
|
|
||||||
public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.HOSTILE;
|
|
||||||
|
|
||||||
// Parameters to control read error correction
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
|
||||||
protected boolean errorCorrectReads = false;
|
|
||||||
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="If specified, we will write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false)
|
|
||||||
protected boolean captureAssemblyFailureBAM = false;
|
|
||||||
|
|
||||||
@Advanced
|
|
||||||
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
|
|
||||||
protected boolean dontUseSoftClippedBases = false;
|
|
||||||
|
|
||||||
@Hidden
|
|
||||||
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
|
||||||
protected boolean justDetermineActiveRegions = false;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// reference base padding size
|
|
||||||
private static final int REFERENCE_PADDING = 500;
|
|
||||||
|
|
||||||
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
|
||||||
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
|
||||||
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* High-level function that runs the assembler on the active region reads,
|
* High-level function that runs the assembler on the active region reads,
|
||||||
* returning a data structure with the resulting information needed
|
* returning a data structure with the resulting information needed
|
||||||
|
|
@ -1224,14 +1140,11 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
|
if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
|
||||||
clippedRead = ReadClipper.hardClipToRegion(clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop());
|
clippedRead = ReadClipper.hardClipToRegion(clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop());
|
||||||
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||||
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
|
|
||||||
readsToUse.add(clippedRead);
|
readsToUse.add(clippedRead);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
|
|
||||||
|
|
||||||
final List<GATKSAMRecord> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
|
final List<GATKSAMRecord> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
|
||||||
|
|
||||||
if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) {
|
if ( MTAC.bamWriter != null && MTAC.emitDroppedReads ) {
|
||||||
|
|
@ -1268,9 +1181,9 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
*
|
*
|
||||||
* @param reads the list of reads to consider
|
* @param reads the list of reads to consider
|
||||||
*/
|
*/
|
||||||
private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads, Set<String> normalSampleNames) {
|
private void cleanOverlappingReadPairs(final List<GATKSAMRecord> reads, final Set<String> normalSampleNames) {
|
||||||
Map<String, List<GATKSAMRecord>> data = splitReadsBySample(reads);
|
final Map<String, List<GATKSAMRecord>> data = splitReadsBySample(reads);
|
||||||
for ( String sampleName : data.keySet() ) {
|
for ( final String sampleName : data.keySet() ) {
|
||||||
final boolean isTumor = !normalSampleNames.contains(sampleName);
|
final boolean isTumor = !normalSampleNames.contains(sampleName);
|
||||||
final List<GATKSAMRecord> perSampleReadList = data.get(sampleName);
|
final List<GATKSAMRecord> perSampleReadList = data.get(sampleName);
|
||||||
|
|
||||||
|
|
@ -1284,16 +1197,15 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void logReadInfo(String readName, Collection<GATKSAMRecord> records, String message) {
|
public static void logReadInfo(final String readName, final Collection<GATKSAMRecord> records, final String message) {
|
||||||
if (readName != null) {
|
if (readName != null) {
|
||||||
for (GATKSAMRecord rec : records) {
|
for (final GATKSAMRecord rec : records) {
|
||||||
logReadInfo(readName, rec, message);
|
logReadInfo(readName, rec, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void logReadInfo(String readName, GATKSAMRecord rec, String message) {
|
public static void logReadInfo(final String readName, final GATKSAMRecord rec, final String message) {
|
||||||
if (readName != null && rec != null && readName.equals(rec.getReadName())) {
|
if (readName != null && rec != null && readName.equals(rec.getReadName())) {
|
||||||
logger.info("Found " + rec.toString() + " - " + message);
|
logger.info("Found " + rec.toString() + " - " + message);
|
||||||
}
|
}
|
||||||
|
|
@ -1322,7 +1234,7 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isReadFromNormal(GATKSAMRecord rec) {
|
private boolean isReadFromNormal(final GATKSAMRecord rec) {
|
||||||
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
|
return normalSampleName != null && normalSampleName.equals(rec.getReadGroup().getSample());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -1338,7 +1250,6 @@ public class MuTect2 extends ActiveRegionWalker<List<VariantContext>, Integer> i
|
||||||
@Advanced
|
@Advanced
|
||||||
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
|
@Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false)
|
||||||
protected boolean doNotRunPhysicalPhasing = false;
|
protected boolean doNotRunPhysicalPhasing = false;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -54,9 +54,11 @@ package org.broadinstitute.gatk.tools.walkers.cancer.m2;
|
||||||
import com.google.java.contract.Ensures;
|
import com.google.java.contract.Ensures;
|
||||||
import htsjdk.samtools.util.StringUtil;
|
import htsjdk.samtools.util.StringUtil;
|
||||||
import htsjdk.variant.variantcontext.*;
|
import htsjdk.variant.variantcontext.*;
|
||||||
|
import org.apache.commons.collections.ListUtils;
|
||||||
import org.apache.commons.lang.mutable.MutableDouble;
|
import org.apache.commons.lang.mutable.MutableDouble;
|
||||||
import org.apache.commons.lang.mutable.MutableInt;
|
import org.apache.commons.lang.mutable.MutableInt;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
|
||||||
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
|
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
|
||||||
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
|
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||||
|
|
@ -78,16 +80,33 @@ import java.util.*;
|
||||||
|
|
||||||
public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
|
|
||||||
protected final M2ArgumentCollection MTAC;
|
private final M2ArgumentCollection MTAC;
|
||||||
private final TumorPowerCalculator strandArtifactPowerCalculator;
|
private final TumorPowerCalculator strandArtifactPowerCalculator;
|
||||||
final boolean REF_AND_ALT = false;
|
|
||||||
final boolean ALT_ONLY = true;
|
private final String tumorSampleName;
|
||||||
|
private final String matchedNormalSampleName;
|
||||||
|
private final String DEBUG_READ_NAME;
|
||||||
|
|
||||||
|
// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
|
||||||
|
private static AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
|
||||||
|
public AFCalculator getInstance(final int ploidy, final int maximumAltAlleles) { return null; }
|
||||||
|
};
|
||||||
|
|
||||||
private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class);
|
private final static Logger logger = Logger.getLogger(SomaticGenotypingEngine.class);
|
||||||
|
|
||||||
public SomaticGenotypingEngine(final M2ArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser, final AFCalculatorProvider afCalculatorProvider, final boolean doPhysicalPhasing, final M2ArgumentCollection MTAC) {
|
public SomaticGenotypingEngine(final M2ArgumentCollection configuration,
|
||||||
super(configuration, samples, genomeLocParser, afCalculatorProvider, doPhysicalPhasing);
|
final SampleList samples,
|
||||||
|
final GenomeLocParser genomeLocParser,
|
||||||
|
final boolean doPhysicalPhasing,
|
||||||
|
final M2ArgumentCollection MTAC,
|
||||||
|
final String tumorSampleName,
|
||||||
|
final String matchedNormalSampleName,
|
||||||
|
final String DEBUG_READ_NAME) {
|
||||||
|
super(configuration, samples, genomeLocParser, DUMMY_AF_CALCULATOR_PROVIDER, doPhysicalPhasing);
|
||||||
this.MTAC = MTAC;
|
this.MTAC = MTAC;
|
||||||
|
this.tumorSampleName = tumorSampleName;
|
||||||
|
this.matchedNormalSampleName = matchedNormalSampleName;
|
||||||
|
this.DEBUG_READ_NAME = DEBUG_READ_NAME;
|
||||||
// coverage related initialization
|
// coverage related initialization
|
||||||
final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10));
|
final double errorProbability = Math.pow(10, -(MTAC.POWER_CONSTANT_QSCORE/10));
|
||||||
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
|
strandArtifactPowerCalculator = new TumorPowerCalculator(errorProbability, MTAC.STRAND_ARTIFACT_LOD_THRESHOLD, 0.0f);
|
||||||
|
|
@ -105,37 +124,22 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
* @param ref Reference bytes at active region
|
* @param ref Reference bytes at active region
|
||||||
* @param refLoc Corresponding active region genome location
|
* @param refLoc Corresponding active region genome location
|
||||||
* @param activeRegionWindow Active window
|
* @param activeRegionWindow Active window
|
||||||
* @param genomeLocParser GenomeLocParser
|
|
||||||
* @param activeAllelesToGenotype Alleles to genotype
|
* @param activeAllelesToGenotype Alleles to genotype
|
||||||
* @param emitReferenceConfidence whether we should add a <NON_REF> alternative allele to the result variation contexts.
|
|
||||||
*
|
*
|
||||||
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
|
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
// @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
|
||||||
@Ensures("result != null")
|
|
||||||
// TODO - can this be refactored? this is hard to follow!
|
// TODO - can this be refactored? this is hard to follow!
|
||||||
public CalledHaplotypes callMutations (
|
public CalledHaplotypes callMutations (
|
||||||
final List<Haplotype> haplotypes,
|
final List<Haplotype> haplotypes,
|
||||||
//final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
final ReadLikelihoods<Haplotype> readLikelihoods,
|
||||||
final ReadLikelihoods<Haplotype> readLikelihoods,
|
final Map<String, Integer> originalNormalReadQualities,
|
||||||
final Map<String, Integer> originalNormalReadQualities,
|
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
final byte[] ref,
|
||||||
final byte[] ref,
|
final GenomeLoc refLoc,
|
||||||
final GenomeLoc refLoc,
|
final GenomeLoc activeRegionWindow,
|
||||||
final GenomeLoc activeRegionWindow,
|
final RefMetaDataTracker tracker,
|
||||||
final GenomeLocParser genomeLocParser,
|
final List<VariantContext> activeAllelesToGenotype) {
|
||||||
final RefMetaDataTracker tracker,
|
|
||||||
final List<VariantContext> activeAllelesToGenotype,
|
|
||||||
final boolean emitReferenceConfidence,
|
|
||||||
final String tumorSampleName,
|
|
||||||
final String matchedNormalSampleName,
|
|
||||||
final RodBinding<VariantContext> dbsnpRod,
|
|
||||||
final List<RodBinding<VariantContext>> cosmicRod,
|
|
||||||
final String DEBUG_READ_NAME
|
|
||||||
|
|
||||||
) {
|
|
||||||
|
|
||||||
// sanity check input arguments
|
// sanity check input arguments
|
||||||
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
||||||
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
|
if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
|
||||||
|
|
@ -143,8 +147,6 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
||||||
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
||||||
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
||||||
if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
|
|
||||||
|
|
||||||
|
|
||||||
// Somatic Tumor/Normal Sample Handling
|
// Somatic Tumor/Normal Sample Handling
|
||||||
verifySamplePresence(tumorSampleName, readLikelihoods.samples());
|
verifySamplePresence(tumorSampleName, readLikelihoods.samples());
|
||||||
|
|
@ -159,293 +161,217 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
final List<VariantContext> returnCalls = new ArrayList<>();
|
final List<VariantContext> returnCalls = new ArrayList<>();
|
||||||
|
|
||||||
for( final int loc : startPosKeySet ) {
|
for( final int loc : startPosKeySet ) {
|
||||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
if( loc < activeRegionWindow.getStart() || loc > activeRegionWindow.getStop() ) {
|
||||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
continue;
|
||||||
|
|
||||||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
|
||||||
|
|
||||||
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
|
|
||||||
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
|
|
||||||
|
|
||||||
// Sanity check the priority list for mistakes
|
|
||||||
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
|
|
||||||
|
|
||||||
// Merge the event to find a common reference representation
|
|
||||||
|
|
||||||
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
|
|
||||||
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
|
||||||
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
|
||||||
|
|
||||||
if( mergedVC == null ) { continue; }
|
|
||||||
|
|
||||||
if (emitReferenceConfidence)
|
|
||||||
mergedVC = addNonRefSymbolicAllele(mergedVC);
|
|
||||||
|
|
||||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
|
|
||||||
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
|
|
||||||
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
|
|
||||||
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
|
|
||||||
}
|
|
||||||
|
|
||||||
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
|
||||||
|
|
||||||
if( configuration.DEBUG && logger != null ) {
|
|
||||||
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
|
||||||
}
|
|
||||||
|
|
||||||
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
|
|
||||||
|
|
||||||
//LDG: do we want to do this before or after pulling out overlapping reads?
|
|
||||||
if (MTAC.isSampleContaminationPresent())
|
|
||||||
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
|
|
||||||
|
|
||||||
//if (!mergedVC.isBiallelic()) {
|
|
||||||
// logger.info("[UNSUPPORTED] Detected non-Biallelic VC" + mergedVC.toString());
|
|
||||||
// continue;
|
|
||||||
//}
|
|
||||||
|
|
||||||
// TODO: once tests are passing, refactor to use the new data structure (not the deprecated one)
|
|
||||||
// handle overlapping fragments
|
|
||||||
// TODO: CONFIRM WITH GSA IF IT IS OK TO REMOVE READS FROM THE PRALM (should be... they do it in filterPoorlyModeledReads!)
|
|
||||||
PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
|
|
||||||
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
|
|
||||||
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
|
|
||||||
// extend to multiple samples
|
|
||||||
|
|
||||||
// compute tumor LOD for each alternate allele
|
|
||||||
// TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you
|
|
||||||
final PerAlleleCollection<Double> altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false);
|
|
||||||
final PerAlleleCollection<Double> tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions);
|
|
||||||
|
|
||||||
if( configuration.DEBUG && logger != null ) {
|
|
||||||
StringBuilder outputSB = new StringBuilder("Calculated allelic fraction at " + loc + " = [");
|
|
||||||
for (Allele allele : altAlleleFractions.getAltAlleles()){
|
|
||||||
outputSB.append( allele + ": " + altAlleleFractions.getAlt(allele) + ", ");
|
|
||||||
}
|
|
||||||
outputSB.append("]");
|
|
||||||
logger.info(outputSB.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
final PerAlleleCollection<Double> tumorLods = PerAlleleCollection.createPerAltAlleleCollection();
|
|
||||||
for (Allele altAllele : mergedVC.getAlternateAlleles()){
|
|
||||||
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (configuration.DEBUG && logger != null) {
|
|
||||||
StringBuilder outputSB = new StringBuilder("Tumor LOD at " + loc + " = [");
|
|
||||||
for (Allele altAllele : tumorLods.getAltAlleles()) {
|
|
||||||
outputSB.append( altAllele + ": " + tumorLods.getAlt(altAllele) + ", ");
|
|
||||||
}
|
|
||||||
outputSB.append("]");
|
|
||||||
logger.info(outputSB.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
|
||||||
double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
|
||||||
PerReadAlleleLikelihoodMap normalPRALM = null;
|
|
||||||
PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
|
|
||||||
|
|
||||||
// if normal bam is available, compute normal LOD
|
|
||||||
if (hasNormal) {
|
|
||||||
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
|
|
||||||
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
|
|
||||||
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads");
|
|
||||||
|
|
||||||
GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
|
|
||||||
Collection<VariantContext> cosmicVC = tracker.getValues(cosmicRod, eventGenomeLoc);
|
|
||||||
Collection<VariantContext> dbsnpVC = tracker.getValues(dbsnpRod, eventGenomeLoc);
|
|
||||||
// remove the effect of cosmic from dbSNP
|
|
||||||
final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
|
|
||||||
|
|
||||||
INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal
|
|
||||||
NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD;
|
|
||||||
|
|
||||||
|
|
||||||
// compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5
|
|
||||||
// note normal LOD is REF over ALT, the reciprocal of the tumor LOD
|
|
||||||
final PerAlleleCollection<Double> diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
|
||||||
for (final Allele allele : mergedVC.getAlternateAlleles()){
|
|
||||||
diploidHetAlleleFractions.setAlt(allele, 0.5);
|
|
||||||
}
|
|
||||||
|
|
||||||
final PerAlleleCollection<Double> normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions);
|
|
||||||
|
|
||||||
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
|
|
||||||
normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int numPassingAlts = 0;
|
|
||||||
Set<Allele> allelesThatPassThreshold = new HashSet<>();
|
|
||||||
Allele alleleWithHighestTumorLOD = null;
|
|
||||||
|
|
||||||
// TODO: use lambda
|
|
||||||
for (Allele altAllele : mergedVC.getAlternateAlleles()) {
|
|
||||||
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
|
|
||||||
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true;
|
|
||||||
if (passesTumorLodThreshold && passesNormalLodThreshold) {
|
|
||||||
numPassingAlts++;
|
|
||||||
allelesThatPassThreshold.add(altAllele);
|
|
||||||
if (alleleWithHighestTumorLOD == null
|
|
||||||
|| tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){
|
|
||||||
alleleWithHighestTumorLOD = altAllele;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final boolean emitVariant = numPassingAlts > 0;
|
|
||||||
|
|
||||||
VariantContext call = null;
|
|
||||||
if (emitVariant) {
|
|
||||||
VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
|
|
||||||
// FIXME: can simply get first alternate since above we only deal with Bi-allelic sites...
|
|
||||||
int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size();
|
|
||||||
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
|
||||||
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD));
|
|
||||||
|
|
||||||
if (hasNormal) {
|
|
||||||
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD));
|
|
||||||
if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) {
|
|
||||||
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// M1-style strand artifact filter
|
|
||||||
// TODO: move code to MuTect2::calculateFilters()
|
|
||||||
// skip if VC has multiple alleles - it will get filtered later anyway
|
|
||||||
if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) {
|
|
||||||
final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap();
|
|
||||||
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
|
|
||||||
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
|
|
||||||
|
|
||||||
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
|
|
||||||
MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
|
|
||||||
MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
|
|
||||||
|
|
||||||
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
|
|
||||||
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
|
|
||||||
final PerAlleleCollection<Double> alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
|
|
||||||
final PerAlleleCollection<Double> tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward);
|
|
||||||
|
|
||||||
final PerAlleleCollection<Double> alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
|
|
||||||
final PerAlleleCollection<Double> tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
|
|
||||||
|
|
||||||
if( configuration.DEBUG && logger != null ) {
|
|
||||||
StringBuilder forwardMessage = new StringBuilder("Calculated forward allelic fraction at " + loc + " = [");
|
|
||||||
StringBuilder reverseMessage = new StringBuilder("Calculated reverse allelic fraction at " + loc + " = [");
|
|
||||||
|
|
||||||
for (Allele altAllele : altAlleleFractions.getAltAlleles()){
|
|
||||||
forwardMessage.append( altAllele + ": " + alleleFractionsForward.getAlt(altAllele) + ", ");
|
|
||||||
reverseMessage.append( altAllele + ": " + alleleFractionsReverse.getAlt(altAllele) + ", ");
|
|
||||||
}
|
|
||||||
|
|
||||||
forwardMessage.append("]");
|
|
||||||
reverseMessage.append("]");
|
|
||||||
|
|
||||||
logger.info(forwardMessage.toString());
|
|
||||||
logger.info(reverseMessage.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
|
|
||||||
double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
|
|
||||||
|
|
||||||
double tumorSBpower_fwd = 0.0;
|
|
||||||
double tumorSBpower_rev = 0.0;
|
|
||||||
try {
|
|
||||||
// Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction
|
|
||||||
tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
|
||||||
tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
|
||||||
}
|
|
||||||
catch (Throwable t) {
|
|
||||||
System.err.println("Error processing " + activeRegionWindow.getContig() + ":" + loc);
|
|
||||||
t.printStackTrace(System.err);
|
|
||||||
throw new RuntimeException(t);
|
|
||||||
}
|
|
||||||
|
|
||||||
callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd);
|
|
||||||
callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev);
|
|
||||||
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd);
|
|
||||||
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev);
|
|
||||||
// TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
|
||||||
|
|
||||||
if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ||
|
|
||||||
(tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD))
|
|
||||||
callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME);
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: this probably belongs in M2::calculateFilters()
|
|
||||||
if (numPassingAlts > 1) {
|
|
||||||
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
|
|
||||||
}
|
|
||||||
|
|
||||||
// build genotypes TODO: this part needs review and refactor
|
|
||||||
List<Allele> tumorAlleles = new ArrayList<>();
|
|
||||||
tumorAlleles.add(mergedVC.getReference());
|
|
||||||
tumorAlleles.add(alleleWithHighestTumorLOD);
|
|
||||||
Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles)
|
|
||||||
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD))
|
|
||||||
.make(); // TODO: add ADs?
|
|
||||||
List<Genotype> genotypes = new ArrayList<>();
|
|
||||||
genotypes.add(tumorGenotype);
|
|
||||||
|
|
||||||
// We assume that the genotype in the normal is 0/0
|
|
||||||
// TODO: is normal always homozygous reference?
|
|
||||||
List<Allele> homRefAllelesforNormalGenotype = new ArrayList<>();
|
|
||||||
homRefAllelesforNormalGenotype.addAll(Collections.nCopies(2, mergedVC.getReference()));
|
|
||||||
|
|
||||||
// if we are calling with a normal, build the genotype for the sample to appear in vcf
|
|
||||||
int REF = 0, ALT = 1;
|
|
||||||
if (hasNormal) {
|
|
||||||
PerAlleleCollection<Integer> normalCounts = getRefAltCount(mergedVC, normalPRALM, false);
|
|
||||||
final int normalRefAlleleDepth = normalCounts.getRef();
|
|
||||||
final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD);
|
|
||||||
final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth };
|
|
||||||
final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth);
|
|
||||||
|
|
||||||
final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype)
|
|
||||||
.AD(normalAlleleDepths)
|
|
||||||
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction)
|
|
||||||
.make();
|
|
||||||
genotypes.add(normalGenotype);
|
|
||||||
}
|
|
||||||
|
|
||||||
//only use alleles found in the tumor (
|
|
||||||
call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// how should we be making use of _perSampleFilteredReadList_?
|
|
||||||
if( call != null ) {
|
|
||||||
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
|
|
||||||
genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call);
|
|
||||||
|
|
||||||
ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
|
|
||||||
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
|
|
||||||
|
|
||||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
|
|
||||||
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
|
|
||||||
|
|
||||||
// maintain the set of all called haplotypes
|
|
||||||
for ( final Allele calledAllele : call.getAlleles() ) {
|
|
||||||
final List<Haplotype> haplotypeList = alleleMapper.get(calledAllele);
|
|
||||||
if (haplotypeList == null) continue;
|
|
||||||
calledHaplotypes.addAll(haplotypeList);
|
|
||||||
}
|
|
||||||
|
|
||||||
returnCalls.add( annotatedCall );
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
||||||
|
|
||||||
|
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||||
|
|
||||||
|
// Create the event mapping object which maps the original haplotype events to the events present at just this locus
|
||||||
|
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
|
||||||
|
|
||||||
|
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
|
||||||
|
|
||||||
|
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
|
||||||
|
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
||||||
|
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||||
|
|
||||||
|
if( mergedVC == null ) { continue; }
|
||||||
|
|
||||||
|
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
|
||||||
|
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
|
||||||
|
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
|
||||||
|
mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function
|
||||||
|
}
|
||||||
|
|
||||||
|
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
||||||
|
|
||||||
|
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
|
||||||
|
|
||||||
|
//LDG: do we want to do this before or after pulling out overlapping reads?
|
||||||
|
if (MTAC.isSampleContaminationPresent())
|
||||||
|
readAlleleLikelihoods.contaminationDownsampling(MTAC.getSampleContamination());
|
||||||
|
|
||||||
|
final PerReadAlleleLikelihoodMap tumorPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(tumorSampleName));
|
||||||
|
filterPRALMForOverlappingReads(tumorPRALM, mergedVC.getReference(), loc, false);
|
||||||
|
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in Tumor PRALM after filtering for overlapping reads");
|
||||||
|
// extend to multiple samples
|
||||||
|
|
||||||
|
// compute tumor LOD for each alternate allele
|
||||||
|
// TODO: somewhere we have to ensure that the all the alleles in the variant context is in alleleFractions passed to getHetGenotypeLogLikelihoods. getHetGenotypeLogLikelihoods will not check that for you
|
||||||
|
final PerAlleleCollection<Double> altAlleleFractions = estimateAlleleFraction(mergedVC, tumorPRALM, false);
|
||||||
|
final PerAlleleCollection<Double> tumorHetGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, tumorPRALM, originalNormalReadQualities, altAlleleFractions);
|
||||||
|
|
||||||
|
final PerAlleleCollection<Double> tumorLods = PerAlleleCollection.createPerAltAlleleCollection();
|
||||||
|
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
|
||||||
|
tumorLods.set(altAllele, tumorHetGenotypeLLs.get(altAllele) - tumorHetGenotypeLLs.getRef());
|
||||||
|
}
|
||||||
|
|
||||||
|
double INIT_NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
||||||
|
double NORMAL_LOD_THRESHOLD = -Double.MAX_VALUE;
|
||||||
|
PerReadAlleleLikelihoodMap normalPRALM = null;
|
||||||
|
final PerAlleleCollection<Double> normalLods = PerAlleleCollection.createPerAltAlleleCollection();
|
||||||
|
|
||||||
|
// if normal bam is available, compute normal LOD
|
||||||
|
if (hasNormal) {
|
||||||
|
normalPRALM = readAlleleLikelihoods.toPerReadAlleleLikelihoodMap(readAlleleLikelihoods.sampleIndex(matchedNormalSampleName));
|
||||||
|
filterPRALMForOverlappingReads(normalPRALM, mergedVC.getReference(), loc, true);
|
||||||
|
MuTect2.logReadInfo(DEBUG_READ_NAME, normalPRALM.getLikelihoodReadMap().keySet(), "Present after in Nomral PRALM filtering for overlapping reads");
|
||||||
|
|
||||||
|
final GenomeLoc eventGenomeLoc = genomeLocParser.createGenomeLoc(activeRegionWindow.getContig(), loc);
|
||||||
|
final Collection<VariantContext> cosmicVC = tracker.getValues(MTAC.cosmicRod, eventGenomeLoc);
|
||||||
|
final Collection<VariantContext> dbsnpVC = tracker.getValues(MTAC.dbsnp.dbsnp, eventGenomeLoc);
|
||||||
|
// remove the effect of cosmic from dbSNP
|
||||||
|
final boolean germlineAtRisk = (!dbsnpVC.isEmpty() && cosmicVC.isEmpty());
|
||||||
|
|
||||||
|
INIT_NORMAL_LOD_THRESHOLD = MTAC.INITIAL_NORMAL_LOD_THRESHOLD; //only set this if this job has a normal
|
||||||
|
NORMAL_LOD_THRESHOLD = (germlineAtRisk)?MTAC.NORMAL_DBSNP_LOD_THRESHOLD:MTAC.NORMAL_LOD_THRESHOLD;
|
||||||
|
|
||||||
|
// compute normal LOD = LL(X|REF)/LL(X|ALT) where ALT is the diploid HET with AF = 0.5
|
||||||
|
// note normal LOD is REF over ALT, the reciprocal of the tumor LOD
|
||||||
|
final PerAlleleCollection<Double> diploidHetAlleleFractions = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
||||||
|
for (final Allele allele : mergedVC.getAlternateAlleles()){
|
||||||
|
diploidHetAlleleFractions.setAlt(allele, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
final PerAlleleCollection<Double> normalGenotypeLLs = getHetGenotypeLogLikelihoods(mergedVC, normalPRALM, originalNormalReadQualities, diploidHetAlleleFractions);
|
||||||
|
|
||||||
|
for (final Allele altAllele : mergedVC.getAlternateAlleles()){
|
||||||
|
normalLods.setAlt(altAllele, normalGenotypeLLs.getRef() - normalGenotypeLLs.getAlt(altAllele));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int numPassingAlts = 0;
|
||||||
|
final Set<Allele> allelesThatPassThreshold = new HashSet<>();
|
||||||
|
Allele alleleWithHighestTumorLOD = null;
|
||||||
|
|
||||||
|
// TODO: use lambda
|
||||||
|
for (final Allele altAllele : mergedVC.getAlternateAlleles()) {
|
||||||
|
final boolean passesTumorLodThreshold = tumorLods.getAlt(altAllele) >= MTAC.INITIAL_TUMOR_LOD_THRESHOLD;
|
||||||
|
final boolean passesNormalLodThreshold = hasNormal ? normalLods.getAlt(altAllele) >= INIT_NORMAL_LOD_THRESHOLD : true;
|
||||||
|
if (passesTumorLodThreshold && passesNormalLodThreshold) {
|
||||||
|
numPassingAlts++;
|
||||||
|
allelesThatPassThreshold.add(altAllele);
|
||||||
|
if (alleleWithHighestTumorLOD == null
|
||||||
|
|| tumorLods.getAlt(altAllele) > tumorLods.getAlt(alleleWithHighestTumorLOD)){
|
||||||
|
alleleWithHighestTumorLOD = altAllele;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (numPassingAlts == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final VariantContextBuilder callVcb = new VariantContextBuilder(mergedVC);
|
||||||
|
// FIXME: can simply get first alternate since above we only deal with Bi-allelic sites...
|
||||||
|
final int haplotypeCount = alleleMapper.get(mergedVC.getAlternateAllele(0)).size();
|
||||||
|
callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
||||||
|
callVcb.attribute(GATKVCFConstants.TUMOR_LOD_KEY, tumorLods.getAlt(alleleWithHighestTumorLOD));
|
||||||
|
|
||||||
|
if (hasNormal) {
|
||||||
|
callVcb.attribute(GATKVCFConstants.NORMAL_LOD_KEY, normalLods.getAlt(alleleWithHighestTumorLOD));
|
||||||
|
if (normalLods.getAlt(alleleWithHighestTumorLOD) < NORMAL_LOD_THRESHOLD) {
|
||||||
|
callVcb.filter(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// M1-style strand artifact filter
|
||||||
|
// TODO: move code to MuTect2::calculateFilters()
|
||||||
|
// skip if VC has multiple alleles - it will get filtered later anyway
|
||||||
|
if (MTAC.ENABLE_STRAND_ARTIFACT_FILTER && numPassingAlts == 1) {
|
||||||
|
final PerReadAlleleLikelihoodMap forwardPRALM = new PerReadAlleleLikelihoodMap();
|
||||||
|
final PerReadAlleleLikelihoodMap reversePRALM = new PerReadAlleleLikelihoodMap();
|
||||||
|
splitPRALMintoForwardAndReverseReads(tumorPRALM, forwardPRALM, reversePRALM);
|
||||||
|
|
||||||
|
MuTect2.logReadInfo(DEBUG_READ_NAME, tumorPRALM.getLikelihoodReadMap().keySet(), "Present in tumor PRALM after PRALM is split");
|
||||||
|
MuTect2.logReadInfo(DEBUG_READ_NAME, forwardPRALM.getLikelihoodReadMap().keySet(), "Present in forward PRALM after PRALM is split");
|
||||||
|
MuTect2.logReadInfo(DEBUG_READ_NAME, reversePRALM.getLikelihoodReadMap().keySet(), "Present in reverse PRALM after PRALM is split");
|
||||||
|
|
||||||
|
// TODO: build a new type for probability, likelihood, and log_likelihood. e.g. f_fwd :: probability[], tumorGLs_fwd :: likelihood[]
|
||||||
|
// TODO: don't want to call getHetGenotypeLogLikelihoods on more than one alternate alelle. May need to overload it to take a scalar f_fwd.
|
||||||
|
final PerAlleleCollection<Double> alleleFractionsForward = estimateAlleleFraction(mergedVC, forwardPRALM, true);
|
||||||
|
final PerAlleleCollection<Double> tumorGenotypeLLForward = getHetGenotypeLogLikelihoods(mergedVC, forwardPRALM, originalNormalReadQualities, alleleFractionsForward);
|
||||||
|
|
||||||
|
final PerAlleleCollection<Double> alleleFractionsReverse = estimateAlleleFraction(mergedVC, reversePRALM, true);
|
||||||
|
final PerAlleleCollection<Double> tumorGenotypeLLReverse = getHetGenotypeLogLikelihoods(mergedVC, reversePRALM, originalNormalReadQualities, alleleFractionsReverse);
|
||||||
|
|
||||||
|
final double tumorLod_fwd = tumorGenotypeLLForward.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLForward.getRef();
|
||||||
|
final double tumorLod_rev = tumorGenotypeLLReverse.getAlt(alleleWithHighestTumorLOD) - tumorGenotypeLLReverse.getRef();
|
||||||
|
|
||||||
|
// Note that we use the observed combined (+ and -) allele fraction for power calculation in either direction
|
||||||
|
final double tumorSBpower_fwd = strandArtifactPowerCalculator.cachedPowerCalculation(forwardPRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
||||||
|
final double tumorSBpower_rev = strandArtifactPowerCalculator.cachedPowerCalculation(reversePRALM.getNumberOfStoredElements(), altAlleleFractions.getAlt(alleleWithHighestTumorLOD));
|
||||||
|
|
||||||
|
|
||||||
|
callVcb.attribute(GATKVCFConstants.TLOD_FWD_KEY, tumorLod_fwd);
|
||||||
|
callVcb.attribute(GATKVCFConstants.TLOD_REV_KEY, tumorLod_rev);
|
||||||
|
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_FWD_KEY, tumorSBpower_fwd);
|
||||||
|
callVcb.attribute(GATKVCFConstants.TUMOR_SB_POWER_REV_KEY, tumorSBpower_rev);
|
||||||
|
// TODO: add vcf INFO fields. see callVcb.attribute(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, haplotypeCount);
|
||||||
|
|
||||||
|
if ((tumorSBpower_fwd > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_fwd < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD) ||
|
||||||
|
(tumorSBpower_rev > MTAC.STRAND_ARTIFACT_POWER_THRESHOLD && tumorLod_rev < MTAC.STRAND_ARTIFACT_LOD_THRESHOLD))
|
||||||
|
callVcb.filter(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: this probably belongs in M2::calculateFilters()
|
||||||
|
if (numPassingAlts > 1) {
|
||||||
|
callVcb.filter(GATKVCFConstants.TRIALLELIC_SITE_FILTER_NAME);
|
||||||
|
}
|
||||||
|
|
||||||
|
// build genotypes TODO: this part needs review and refactor
|
||||||
|
final List<Allele> tumorAlleles = Arrays.asList(mergedVC.getReference(), alleleWithHighestTumorLOD);
|
||||||
|
final Genotype tumorGenotype = new GenotypeBuilder(tumorSampleName, tumorAlleles)
|
||||||
|
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, altAlleleFractions.getAlt(alleleWithHighestTumorLOD))
|
||||||
|
.make(); // TODO: add ADs?
|
||||||
|
final List<Genotype> genotypes = new ArrayList<>();
|
||||||
|
genotypes.add(tumorGenotype);
|
||||||
|
|
||||||
|
// We assume that the genotype in the normal is 0/0
|
||||||
|
// TODO: is normal always homozygous reference?
|
||||||
|
final List<Allele> homRefAllelesforNormalGenotype = Collections.nCopies(2, mergedVC.getReference());
|
||||||
|
|
||||||
|
// if we are calling with a normal, build the genotype for the sample to appear in vcf
|
||||||
|
if (hasNormal) {
|
||||||
|
final PerAlleleCollection<Integer> normalCounts = getRefAltCount(mergedVC, normalPRALM, false);
|
||||||
|
final int normalRefAlleleDepth = normalCounts.getRef();
|
||||||
|
final int normalAltAlleleDepth = normalCounts.getAlt(alleleWithHighestTumorLOD);
|
||||||
|
final int[] normalAlleleDepths = { normalRefAlleleDepth, normalAltAlleleDepth };
|
||||||
|
final double normalAlleleFraction = (double) normalAltAlleleDepth / ( normalRefAlleleDepth + normalAltAlleleDepth);
|
||||||
|
|
||||||
|
final Genotype normalGenotype = new GenotypeBuilder(matchedNormalSampleName, homRefAllelesforNormalGenotype)
|
||||||
|
.AD(normalAlleleDepths)
|
||||||
|
.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, normalAlleleFraction)
|
||||||
|
.make();
|
||||||
|
genotypes.add(normalGenotype);
|
||||||
|
}
|
||||||
|
|
||||||
|
//only use alleles found in the tumor (
|
||||||
|
final VariantContext call = new VariantContextBuilder(callVcb).alleles(tumorAlleles).genotypes(genotypes).make();
|
||||||
|
|
||||||
|
// how should we be making use of _perSampleFilteredReadList_?
|
||||||
|
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
|
||||||
|
genomeLocParser, false, alleleMapper, readAlleleLikelihoods, call);
|
||||||
|
|
||||||
|
final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
|
||||||
|
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker, readAlleleLikelihoods, call, false);
|
||||||
|
|
||||||
|
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
|
||||||
|
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||||
|
|
||||||
|
// maintain the set of all called haplotypes
|
||||||
|
call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll);
|
||||||
|
returnCalls.add( annotatedCall );
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: understand effect of enabling this for somatic calling...
|
// TODO: understand effect of enabling this for somatic calling...
|
||||||
final List<VariantContext> phasedCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
|
final List<VariantContext> outputCalls = doPhysicalPhasing ? phaseCalls(returnCalls, calledHaplotypes) : returnCalls;
|
||||||
return new CalledHaplotypes(phasedCalls, calledHaplotypes);
|
return new CalledHaplotypes(outputCalls, calledHaplotypes);
|
||||||
//return new CalledHaplotypes(returnCalls, calledHaplotypes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifySamplePresence(String sampleName, List<String> samples) {
|
private void verifySamplePresence(final String sampleName, final List<String> samples) {
|
||||||
if (!samples.contains(sampleName)) {
|
if (!samples.contains(sampleName)) {
|
||||||
throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples));
|
throw new IllegalArgumentException("Unable to find sample name "+sampleName+"in sample list of " + StringUtil.join(",", samples));
|
||||||
}
|
}
|
||||||
|
|
@ -459,7 +385,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
* @param alleleFractions allele fraction(s) for alternate allele(s)
|
* @param alleleFractions allele fraction(s) for alternate allele(s)
|
||||||
*
|
*
|
||||||
* @return genotype likelihoods for homRef and het for each alternate allele
|
* @return genotype likelihoods for homRef and het for each alternate allele
|
||||||
*/
|
*/
|
||||||
private PerAlleleCollection<Double> getHetGenotypeLogLikelihoods(final VariantContext mergedVC,
|
private PerAlleleCollection<Double> getHetGenotypeLogLikelihoods(final VariantContext mergedVC,
|
||||||
final PerReadAlleleLikelihoodMap tumorPRALM,
|
final PerReadAlleleLikelihoodMap tumorPRALM,
|
||||||
final Map<String, Integer> originalNormalMQs,
|
final Map<String, Integer> originalNormalMQs,
|
||||||
|
|
@ -470,13 +396,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
final PerAlleleCollection<MutableDouble> genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
final PerAlleleCollection<MutableDouble> genotypeLogLikelihoods = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
||||||
for (final Allele allele : mergedVC.getAlleles()){
|
mergedVC.getAlleles().forEach(a -> genotypeLogLikelihoods.set(a, new MutableDouble(0)));
|
||||||
genotypeLogLikelihoods.set(allele, new MutableDouble(0.0));
|
|
||||||
}
|
|
||||||
|
|
||||||
final Allele refAllele = mergedVC.getReference();
|
final Allele refAllele = mergedVC.getReference();
|
||||||
for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) {
|
for(Map.Entry<GATKSAMRecord,Map<Allele, Double>> readAlleleLikelihoodMap : tumorPRALM.getLikelihoodReadMap().entrySet()) {
|
||||||
Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
|
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
|
||||||
if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) {
|
if (originalNormalMQs.get(readAlleleLikelihoodMap.getKey().getReadName()) == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -484,9 +408,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele);
|
final double readRefLogLikelihood = alleleLikelihoodMap.get(refAllele);
|
||||||
genotypeLogLikelihoods.getRef().add(readRefLogLikelihood);
|
genotypeLogLikelihoods.getRef().add(readRefLogLikelihood);
|
||||||
|
|
||||||
for (Allele altAllele : alleleFractions.getAltAlleles()) {
|
for (final Allele altAllele : alleleFractions.getAltAlleles()) {
|
||||||
double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele);
|
final double readAltLogLikelihood = alleleLikelihoodMap.get(altAllele);
|
||||||
double adjustedReadAltLL = Math.log10(
|
final double adjustedReadAltLL = Math.log10(
|
||||||
Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) +
|
Math.pow(10, readRefLogLikelihood) * (1 - alleleFractions.getAlt(altAllele)) +
|
||||||
Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele)
|
Math.pow(10, readAltLogLikelihood) * alleleFractions.getAlt(altAllele)
|
||||||
);
|
);
|
||||||
|
|
@ -515,9 +439,9 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
final PerAlleleCollection<Integer> alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly);
|
final PerAlleleCollection<Integer> alleleCounts = getRefAltCount(vc, pralm, oneStrandOnly);
|
||||||
final PerAlleleCollection<Double> alleleFractions = PerAlleleCollection.createPerAltAlleleCollection();
|
final PerAlleleCollection<Double> alleleFractions = PerAlleleCollection.createPerAltAlleleCollection();
|
||||||
|
|
||||||
int refCount = alleleCounts.getRef();
|
final int refCount = alleleCounts.getRef();
|
||||||
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
|
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
|
||||||
int altCount = alleleCounts.getAlt(altAllele);
|
final int altCount = alleleCounts.getAlt(altAllele);
|
||||||
double alleleFraction = (double) altCount / (refCount + altCount);
|
double alleleFraction = (double) altCount / (refCount + altCount);
|
||||||
// weird case, but I've seen it happen in one strand cases
|
// weird case, but I've seen it happen in one strand cases
|
||||||
if (refCount == 0 && altCount == refCount ) {
|
if (refCount == 0 && altCount == refCount ) {
|
||||||
|
|
@ -552,16 +476,12 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
|
|
||||||
|
|
||||||
final PerAlleleCollection<MutableInt> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
final PerAlleleCollection<MutableInt> alleleCounts = PerAlleleCollection.createPerRefAndAltAlleleCollection();
|
||||||
|
vcAlleles.stream().forEach(a -> alleleCounts.set(a, new MutableInt(0)));
|
||||||
// initialize the allele counts to 0
|
|
||||||
for (final Allele allele : vcAlleles) {
|
|
||||||
alleleCounts.set(allele, new MutableInt(0));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) {
|
for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> readAlleleLikelihoodMap : pralm.getLikelihoodReadMap().entrySet()) {
|
||||||
final GATKSAMRecord read = readAlleleLikelihoodMap.getKey();
|
final GATKSAMRecord read = readAlleleLikelihoodMap.getKey();
|
||||||
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
|
final Map<Allele, Double> alleleLikelihoodMap = readAlleleLikelihoodMap.getValue();
|
||||||
MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles);
|
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleLikelihoodMap, vcAlleles);
|
||||||
|
|
||||||
if (read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) {
|
if (read.getMappingQuality() > 0 && mostLikelyAllele.isInformative()) {
|
||||||
alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment();
|
alleleCounts.get(mostLikelyAllele.getMostLikelyAllele()).increment();
|
||||||
|
|
@ -581,18 +501,16 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void filterPRALMForOverlappingReads(PerReadAlleleLikelihoodMap pralm, Allele ref, int location, boolean retainMismatches) {
|
private void filterPRALMForOverlappingReads(final PerReadAlleleLikelihoodMap pralm, final Allele ref, final int location, final boolean retainMismatches) {
|
||||||
|
final Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
|
||||||
Map<GATKSAMRecord, Map<Allele, Double>> m = pralm.getLikelihoodReadMap();
|
|
||||||
|
|
||||||
|
|
||||||
// iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them
|
// iterate through the reads, if the name has been seen before we have overlapping (potentially) fragments, so handle them
|
||||||
Map<String, GATKSAMRecord> nameToRead = new HashMap<>();
|
final Map<String, GATKSAMRecord> nameToRead = new HashMap<>();
|
||||||
Set<GATKSAMRecord> readsToKeep = new HashSet<>();
|
final Set<GATKSAMRecord> readsToKeep = new HashSet<>();
|
||||||
|
|
||||||
for(GATKSAMRecord rec : m.keySet()) {
|
for(final GATKSAMRecord rec : m.keySet()) {
|
||||||
// if we haven't seen it... just record the name and add it to the list of reads to keep
|
// if we haven't seen it... just record the name and add it to the list of reads to keep
|
||||||
GATKSAMRecord existing = nameToRead.get(rec.getReadName());
|
final GATKSAMRecord existing = nameToRead.get(rec.getReadName());
|
||||||
if (existing == null) {
|
if (existing == null) {
|
||||||
nameToRead.put(rec.getReadName(), rec);
|
nameToRead.put(rec.getReadName(), rec);
|
||||||
readsToKeep.add(rec);
|
readsToKeep.add(rec);
|
||||||
|
|
@ -604,11 +522,11 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
// TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!!
|
// TODO: CHECK IF THE READS BOTH OVERLAP THE POSITION!!!!
|
||||||
if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) {
|
if ( ReadUtils.isInsideRead(existing, location) && ReadUtils.isInsideRead(rec, location) ) {
|
||||||
|
|
||||||
MostLikelyAllele existingMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing));
|
final MostLikelyAllele existingMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(existing));
|
||||||
Allele existingAllele = existingMLA.getMostLikelyAllele();
|
final Allele existingAllele = existingMLA.getMostLikelyAllele();
|
||||||
|
|
||||||
MostLikelyAllele recMLA = pralm.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec));
|
final MostLikelyAllele recMLA = PerReadAlleleLikelihoodMap.getMostLikelyAllele(pralm.getLikelihoodReadMap().get(rec));
|
||||||
Allele recAllele = recMLA.getMostLikelyAllele();
|
final Allele recAllele = recMLA.getMostLikelyAllele();
|
||||||
|
|
||||||
// if the reads disagree at this position...
|
// if the reads disagree at this position...
|
||||||
if (!existingAllele.equals(recAllele)) {
|
if (!existingAllele.equals(recAllele)) {
|
||||||
|
|
@ -654,7 +572,7 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) {
|
private void splitPRALMintoForwardAndReverseReads(final PerReadAlleleLikelihoodMap originalPRALM, final PerReadAlleleLikelihoodMap forwardPRALM, final PerReadAlleleLikelihoodMap reversePRALM) {
|
||||||
Map<GATKSAMRecord, Map<Allele, Double>> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap();
|
final Map<GATKSAMRecord, Map<Allele, Double>> origReadAlleleLikelihoodMap = originalPRALM.getLikelihoodReadMap();
|
||||||
for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) {
|
for (final GATKSAMRecord read : origReadAlleleLikelihoodMap.keySet()) {
|
||||||
if (read.isStrandless())
|
if (read.isStrandless())
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -669,17 +587,4 @@ public class SomaticGenotypingEngine extends HaplotypeCallerGenotypingEngine {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Move to utility class so we can use one shared with HaplotypeCallerGenotypingEngine
|
|
||||||
private VariantContext addNonRefSymbolicAllele(final VariantContext mergedVC) {
|
|
||||||
final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC);
|
|
||||||
final List<Allele> originalList = mergedVC.getAlleles();
|
|
||||||
final List<Allele> alleleList = new ArrayList<>(originalList.size() + 1);
|
|
||||||
alleleList.addAll(mergedVC.getAlleles());
|
|
||||||
alleleList.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
|
|
||||||
vcb.alleles(alleleList);
|
|
||||||
return vcb.make();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,7 @@ import org.apache.commons.math.MathException;
|
||||||
import org.apache.commons.math.distribution.BinomialDistribution;
|
import org.apache.commons.math.distribution.BinomialDistribution;
|
||||||
import org.apache.commons.math.distribution.BinomialDistributionImpl;
|
import org.apache.commons.math.distribution.BinomialDistributionImpl;
|
||||||
import org.apache.commons.math3.util.Pair;
|
import org.apache.commons.math3.util.Pair;
|
||||||
|
import org.broadinstitute.gatk.utils.exceptions.GATKException;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
@ -70,7 +71,6 @@ public class TumorPowerCalculator {
|
||||||
private final double tumorLODThreshold;
|
private final double tumorLODThreshold;
|
||||||
private final double contamination;
|
private final double contamination;
|
||||||
private final boolean enableSmoothing;
|
private final boolean enableSmoothing;
|
||||||
public static int numCacheHits = 0;
|
|
||||||
|
|
||||||
private final HashMap<PowerCacheKey, Double> cache = new HashMap<PowerCacheKey, Double>();
|
private final HashMap<PowerCacheKey, Double> cache = new HashMap<PowerCacheKey, Double>();
|
||||||
|
|
||||||
|
|
@ -134,16 +134,18 @@ public class TumorPowerCalculator {
|
||||||
* @throws MathException
|
* @throws MathException
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public double cachedPowerCalculation(final int numReads, final double alleleFraction) throws MathException {
|
public double cachedPowerCalculation(final int numReads, final double alleleFraction) {
|
||||||
PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction);
|
PowerCacheKey key = new PowerCacheKey(numReads, alleleFraction);
|
||||||
// we first look up if power for given number of read and allele fraction has already been computed and stored in the cache.
|
// we first look up if power for given number of read and allele fraction has already been computed and stored in the cache.
|
||||||
// if not we compute it and store it in teh cache.
|
// if not we compute it and store it in teh cache.
|
||||||
Double power = cache.get(key);
|
Double power = cache.get(key);
|
||||||
if (power == null) {
|
if (power == null) {
|
||||||
power = calculatePower(numReads, alleleFraction);
|
try {
|
||||||
|
power = calculatePower(numReads, alleleFraction);
|
||||||
|
} catch (final Exception ex) {
|
||||||
|
throw new GATKException("Power calculation failed", ex);
|
||||||
|
}
|
||||||
cache.put(key, power);
|
cache.put(key, power);
|
||||||
} else {
|
|
||||||
numCacheHits++;
|
|
||||||
}
|
}
|
||||||
return power;
|
return power;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue