diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index a544f064b..7d41f37df 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -480,8 +480,39 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private byte MIN_TAIL_QUALITY; private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; - // the minimum length of a read we'd consider using for genotyping - private final static int MIN_READ_LENGTH = 10; + /** + * Minimum (exclusive) average number of high quality bases per soft-clip to consider that a set of soft-clips is a + * high quality set. + */ + private static final double AVERAGE_HQ_SOFTCLIPS_HQ_BASES_THRESHOLD = 6.0; + + /** + * Maximum-minimum confidence on a variant to exist to consider the position as a potential variant-harbouring locus + * when looking for active regions. + */ + private static final double MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY = 4.0; + + /** + * Minimum ploidy assumed when looking for loci that may harbour variation to identify active regions. + *

+ * By default we take the putative ploidy provided by the user, but this turned out to be too insensitive + * for low ploidy, most notably with haploid samples. Therefore we impose this minimum. + *

+ */ + private static final int MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY = 2; + + + /** + * Reads with length lower than this number, after clipping off overhangs outside the active region, + * won't be considered for genotyping. + */ + private final static int READ_LENGTH_FILTER_THRESHOLD = 10; + + /** + * Reads with mapping quality lower than this number won't be considered for genotyping, even if they have + * passed earlier filters (e.g. the walkers' own min MQ filter). + */ + private static final int READ_QUALITY_FILTER_THRESHOLD = 20; private SampleList samplesList; @@ -569,14 +600,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final UnifiedArgumentCollection simpleUAC = HCAC.cloneTo(UnifiedArgumentCollection.class); simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY; simpleUAC.genotypingOutputMode = GenotypingOutputMode.DISCOVERY; - simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling 
simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; // Seems that at least with some test data we can lose genuine haploid variation if we use // UGs engine with ploidy == 1 - simpleUAC.genotypeArgs.samplePloidy = Math.max(2, HCAC.genotypeArgs.samplePloidy); + simpleUAC.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.samplePloidy); activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(simpleUAC, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(),simpleUAC,logger), toolkit); @@ -783,7 +814,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final VariantCallContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); + return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > AVERAGE_HQ_SOFTCLIPS_HQ_BASES_THRESHOLD ? 
ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); } //--------------------------------------------------------------------------------------------------------------- @@ -1138,7 +1169,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private Set filterNonPassingReads( final ActiveRegion activeRegion ) { final Set readsToRemove = new LinkedHashSet<>(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { - if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { + if( rec.getReadLength() < READ_LENGTH_FILTER_THRESHOLD || rec.getMappingQuality() < READ_QUALITY_FILTER_THRESHOLD || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { readsToRemove.add(rec); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index 8ce960ca1..6f62457ec 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -97,7 +97,26 @@ public class ReferenceConfidenceModel { private final static boolean WRITE_DEBUGGING_BAM = false; private final SAMFileWriter debuggingWriter; - private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; + /** + * Surrogate quality score for no base calls. + *

+ * This is the quality assigned to deletion pile-up elements (which have no base-call quality of their own), + * when assessing the confidence on the hom-ref call at that site. + *

+ */ + private final static byte REF_MODEL_DELETION_QUAL = 30; + + /** + * Base calls with quality lower than this threshold won't be considered when assessing the + * confidence on the hom-ref call. + */ + private final static byte BASE_QUAL_THRESHOLD = 6; + + /** + * Only base calls with quality strictly greater than this constant + * will be considered high quality if they are part of a soft-clip. + */ + private final static byte HQ_BASE_QUALITY_SOFTCLIP_THRESHOLD = 28; /** * Create a new ReferenceConfidenceModel @@ -147,7 +166,6 @@ public class ReferenceConfidenceModel { if ( debuggingWriter != null ) debuggingWriter.close(); } - /** * Calculate the reference confidence for a single sample given the its read data * @@ -209,7 +227,7 @@ public class ReferenceConfidenceModel { // Assume infinite population on a single sample. final int refOffset = offset + globalRefOffset; final byte refBase = ref[refOffset]; - final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, (byte)6, null); + final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, BASE_QUAL_THRESHOLD, null); homRefCalc.capByHomRefLikelihood(); final Allele refAllele = Allele.create(refBase, true); @@ -269,8 +287,27 @@ protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway private static final int INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY = 20; private static GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY + 1][]; + + /** + * Indel error rate for the indel model used to assess the confidence on the hom-ref call. + */ + private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp + /** + * Phred scaled qual value that corresponds to the {@link #INDEL_ERROR_RATE indel error rate}. 
+ */ + private static final byte INDEL_QUAL = (byte) Math.round((INDEL_ERROR_RATE * -10.0)); + + /** + * No indel likelihood (ref allele) used in the indel model to assess the confidence on the hom-ref call. + */ + private static final double NO_INDEL_LIKELIHOOD = QualityUtils.qualToProbLog10(INDEL_QUAL); + + /** + * Indel likelihood (alt. allele) used in the indel model to assess the confidence on the hom-ref call. + */ + private static final double INDEL_LIKELIHOOD = QualityUtils.qualToErrorProbLog10(INDEL_QUAL); + private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) { return initializeIndelPLCache(ploidy)[nInformativeReads]; } @@ -287,14 +324,11 @@ public class ReferenceConfidenceModel { final GenotypeLikelihoods[] result = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; result[0] = GenotypeLikelihoods.fromLog10Likelihoods(new double[ploidy + 1]); for( int nInformativeReads = 1; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { - final byte indelQual = (byte) Math.round((INDEL_ERROR_RATE * -10)); - final double refLikelihood = QualityUtils.qualToProbLog10(indelQual); - final double altLikelihood = QualityUtils.qualToErrorProbLog10(indelQual); double[] PLs = new double[ploidy + 1]; - PLs[0] = nInformativeReads * refLikelihood; + PLs[0] = nInformativeReads * NO_INDEL_LIKELIHOOD; for (int altCount = 1; altCount <= ploidy; altCount++) { - final double refLikelihoodAccum = refLikelihood + MathUtils.Log10Cache.get(ploidy - altCount); - final double altLikelihoodAccum = altLikelihood + MathUtils.Log10Cache.get(altCount); + final double refLikelihoodAccum = NO_INDEL_LIKELIHOOD + MathUtils.Log10Cache.get(ploidy - altCount); + final double altLikelihoodAccum = INDEL_LIKELIHOOD + MathUtils.Log10Cache.get(altCount); PLs[altCount] = nInformativeReads * (MathUtils.approximateLog10SumLog10(refLikelihoodAccum ,altLikelihoodAccum) + denominator); } result[nInformativeReads] = 
GenotypeLikelihoods.fromLog10Likelihoods(PLs); @@ -361,7 +395,7 @@ public class ReferenceConfidenceModel { referenceLikelihood + MathUtils.Log10Cache.get(j), nonRefLikelihood + MathUtils.Log10Cache.get(i)); if (isAlt && hqSoftClips != null && element.isNextToSoftClip()) - hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(element.getRead(), (byte) 28)); + hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(element.getRead(), HQ_BASE_QUALITY_SOFTCLIP_THRESHOLD)); } /**