Merge pull request #1099 from broadinstitute/vrr_magic_numbers
Extracted some constant expressions involved HC variation discovery a…
This commit is contained in:
commit
bb4c9fa1d3
|
|
@ -480,8 +480,39 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
private byte MIN_TAIL_QUALITY;
|
||||
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
|
||||
|
||||
// the minimum length of a read we'd consider using for genotyping
|
||||
private final static int MIN_READ_LENGTH = 10;
|
||||
/**
|
||||
* Minimum (exclusive) average number of high quality bases per soft-clip to consider that a set of soft-clips is a
|
||||
* high quality set.
|
||||
*/
|
||||
private static final double AVERAGE_HQ_SOFTCLIPS_HQ_BASES_THRESHOLD = 6.0;
|
||||
|
||||
/**
|
||||
* Maximum-mininum confidence on a variant to exist to consider the position as a potential variant harbouring locus
|
||||
* when looking for active regions.
|
||||
*/
|
||||
private static final double MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY = 4.0;
|
||||
|
||||
/**
|
||||
* Minimum ploidy assumed when looking for loci that may harbour variation to identify active regions.
|
||||
* <p>
|
||||
* By default we take the putative ploidy provided by the user, but this turned out to be too insensitive
|
||||
* for low ploidy, notoriously with haploid samples. Therefore we impose this minimum.
|
||||
* </p>
|
||||
*/
|
||||
private static final int MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY = 2;
|
||||
|
||||
|
||||
/**
|
||||
* Reads with length lower than this number, after clipping off overhands outside the active region,
|
||||
* won't be considered for genotyping.
|
||||
*/
|
||||
private final static int READ_LENGTH_FILTER_THRESHOLD = 10;
|
||||
|
||||
/**
|
||||
* Reads with mapping quality lower than this number won't be considered for genotyping, even if they have
|
||||
* passed earlier filters (e.g. the walkers' own min MQ filter).
|
||||
*/
|
||||
private static final int READ_QUALITY_FILTER_THRESHOLD = 20;
|
||||
|
||||
private SampleList samplesList;
|
||||
|
||||
|
|
@ -569,14 +600,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
final UnifiedArgumentCollection simpleUAC = HCAC.cloneTo(UnifiedArgumentCollection.class);
|
||||
simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY;
|
||||
simpleUAC.genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
|
||||
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.CONTAMINATION_FRACTION = 0.0;
|
||||
simpleUAC.CONTAMINATION_FRACTION_FILE = null;
|
||||
simpleUAC.exactCallsLog = null;
|
||||
// Seems that at least with some test data we can lose genuine haploid variation if we use
|
||||
// UGs engine with ploidy == 1
|
||||
simpleUAC.genotypeArgs.samplePloidy = Math.max(2, HCAC.genotypeArgs.samplePloidy);
|
||||
simpleUAC.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.samplePloidy);
|
||||
|
||||
activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(simpleUAC,
|
||||
FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(),simpleUAC,logger), toolkit);
|
||||
|
|
@ -783,7 +814,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
final VariantCallContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP);
|
||||
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
|
||||
|
||||
return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() );
|
||||
return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > AVERAGE_HQ_SOFTCLIPS_HQ_BASES_THRESHOLD ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -1138,7 +1169,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
private Set<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion ) {
|
||||
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
|
||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||
if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
if( rec.getReadLength() < READ_LENGTH_FILTER_THRESHOLD || rec.getMappingQuality() < READ_QUALITY_FILTER_THRESHOLD || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
readsToRemove.add(rec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -97,7 +97,26 @@ public class ReferenceConfidenceModel {
|
|||
private final static boolean WRITE_DEBUGGING_BAM = false;
|
||||
private final SAMFileWriter debuggingWriter;
|
||||
|
||||
private final static byte REF_MODEL_DELETION_QUAL = (byte) 30;
|
||||
/**
|
||||
* Surrogate quality score for no base calls.
|
||||
* <p>
|
||||
* This is the quality assigned to deletion (so without its own base-call quality) pile-up elements,
|
||||
* when assessing the confidence on the hom-ref call at that site.
|
||||
* </p>
|
||||
*/
|
||||
private final static byte REF_MODEL_DELETION_QUAL = 30;
|
||||
|
||||
/**
|
||||
* Base calls with quality threshold lower than this number won't be considered when assessing the
|
||||
* confidence on the hom-ref call.
|
||||
*/
|
||||
private final static byte BASE_QUAL_THRESHOLD = 6;
|
||||
|
||||
/**
|
||||
* Only base calls with quality strictly greater than this constant,
|
||||
* will be considered high quality if they are part of a soft-clip.
|
||||
*/
|
||||
private final static byte HQ_BASE_QUALITY_SOFTCLIP_THRESHOLD = 28;
|
||||
|
||||
/**
|
||||
* Create a new ReferenceConfidenceModel
|
||||
|
|
@ -147,7 +166,6 @@ public class ReferenceConfidenceModel {
|
|||
if ( debuggingWriter != null ) debuggingWriter.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate the reference confidence for a single sample given the its read data
|
||||
*
|
||||
|
|
@ -209,7 +227,7 @@ public class ReferenceConfidenceModel {
|
|||
// Assume infinite population on a single sample.
|
||||
final int refOffset = offset + globalRefOffset;
|
||||
final byte refBase = ref[refOffset];
|
||||
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, (byte)6, null);
|
||||
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, BASE_QUAL_THRESHOLD, null);
|
||||
homRefCalc.capByHomRefLikelihood();
|
||||
|
||||
final Allele refAllele = Allele.create(refBase, true);
|
||||
|
|
@ -269,8 +287,27 @@ public class ReferenceConfidenceModel {
|
|||
protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway
|
||||
private static final int INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY = 20;
|
||||
private static GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY + 1][];
|
||||
|
||||
/**
|
||||
* Indel error rate for the indel model used to assess the confidence on the hom-ref call.
|
||||
*/
|
||||
private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
|
||||
|
||||
/**
|
||||
* Phred scaled qual value that corresponds to the {@link #INDEL_ERROR_RATE indel error rate}.
|
||||
*/
|
||||
private static final byte INDEL_QUAL = (byte) Math.round((INDEL_ERROR_RATE * -10.0));
|
||||
|
||||
/**
|
||||
* No indel likelihood (ref allele) used in the indel model to assess the confidence on the hom-ref call.
|
||||
*/
|
||||
private static final double NO_INDEL_LIKELIHOOD = QualityUtils.qualToProbLog10(INDEL_QUAL);
|
||||
|
||||
/**
|
||||
* Indel likelihood (alt. allele) used in the indel model to assess the confidence on the hom-ref call.
|
||||
*/
|
||||
private static final double INDEL_LIKELIHOOD = QualityUtils.qualToErrorProbLog10(INDEL_QUAL);
|
||||
|
||||
private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) {
|
||||
return initializeIndelPLCache(ploidy)[nInformativeReads];
|
||||
}
|
||||
|
|
@ -287,14 +324,11 @@ public class ReferenceConfidenceModel {
|
|||
final GenotypeLikelihoods[] result = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1];
|
||||
result[0] = GenotypeLikelihoods.fromLog10Likelihoods(new double[ploidy + 1]);
|
||||
for( int nInformativeReads = 1; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) {
|
||||
final byte indelQual = (byte) Math.round((INDEL_ERROR_RATE * -10));
|
||||
final double refLikelihood = QualityUtils.qualToProbLog10(indelQual);
|
||||
final double altLikelihood = QualityUtils.qualToErrorProbLog10(indelQual);
|
||||
double[] PLs = new double[ploidy + 1];
|
||||
PLs[0] = nInformativeReads * refLikelihood;
|
||||
PLs[0] = nInformativeReads * NO_INDEL_LIKELIHOOD;
|
||||
for (int altCount = 1; altCount <= ploidy; altCount++) {
|
||||
final double refLikelihoodAccum = refLikelihood + MathUtils.Log10Cache.get(ploidy - altCount);
|
||||
final double altLikelihoodAccum = altLikelihood + MathUtils.Log10Cache.get(altCount);
|
||||
final double refLikelihoodAccum = NO_INDEL_LIKELIHOOD + MathUtils.Log10Cache.get(ploidy - altCount);
|
||||
final double altLikelihoodAccum = INDEL_LIKELIHOOD + MathUtils.Log10Cache.get(altCount);
|
||||
PLs[altCount] = nInformativeReads * (MathUtils.approximateLog10SumLog10(refLikelihoodAccum ,altLikelihoodAccum) + denominator);
|
||||
}
|
||||
result[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(PLs);
|
||||
|
|
@ -361,7 +395,7 @@ public class ReferenceConfidenceModel {
|
|||
referenceLikelihood + MathUtils.Log10Cache.get(j),
|
||||
nonRefLikelihood + MathUtils.Log10Cache.get(i));
|
||||
if (isAlt && hqSoftClips != null && element.isNextToSoftClip())
|
||||
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(element.getRead(), (byte) 28));
|
||||
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(element.getRead(), HQ_BASE_QUALITY_SOFTCLIP_THRESHOLD));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue