Extracted some constant expressions involved in HC variation discovery and genotyping.

Addresses issue #1092.
This commit is contained in:
vruano 2015-07-28 15:32:10 -04:00
parent 9d9827f176
commit 02c7876c72
2 changed files with 82 additions and 17 deletions

View File

@ -480,8 +480,39 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private byte MIN_TAIL_QUALITY;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
// the minimum length of a read we'd consider using for genotyping
private final static int MIN_READ_LENGTH = 10;
/**
* Minimum (exclusive) average number of high quality bases per soft-clip to consider that a set of soft-clips is a
* high quality set.
*/
private static final double AVERAGE_HQ_SOFTCLIPS_HQ_BASES_THRESHOLD = 6.0;
/**
* Maximum-minimum confidence on a variant to exist to consider the position as a potential variant harbouring locus
* when looking for active regions.
*/
private static final double MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY = 4.0;
/**
* Minimum ploidy assumed when looking for loci that may harbour variation to identify active regions.
* <p>
* By default we take the putative ploidy provided by the user, but this turned out to be too insensitive
* for low ploidy, notoriously with haploid samples. Therefore we impose this minimum.
* </p>
*/
private static final int MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY = 2;
/**
* Reads with length lower than this number, after clipping off overhangs outside the active region,
* won't be considered for genotyping.
*/
private final static int READ_LENGTH_FILTER_THRESHOLD = 10;
/**
* Reads with mapping quality lower than this number won't be considered for genotyping, even if they have
* passed earlier filters (e.g. the walkers' own min MQ filter).
*/
private static final int READ_QUALITY_FILTER_THRESHOLD = 20;
private SampleList samplesList;
@ -569,14 +600,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final UnifiedArgumentCollection simpleUAC = HCAC.cloneTo(UnifiedArgumentCollection.class);
simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY;
simpleUAC.genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min(MAXMIN_CONFIDENCE_FOR_CONSIDERING_A_SITE_AS_POSSIBLE_VARIANT_IN_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.CONTAMINATION_FRACTION = 0.0;
simpleUAC.CONTAMINATION_FRACTION_FILE = null;
simpleUAC.exactCallsLog = null;
// Seems that at least with some test data we can lose genuine haploid variation if we use
// UGs engine with ploidy == 1
simpleUAC.genotypeArgs.samplePloidy = Math.max(2, HCAC.genotypeArgs.samplePloidy);
simpleUAC.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, HCAC.genotypeArgs.samplePloidy);
activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(simpleUAC,
FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(),simpleUAC,logger), toolkit);
@ -783,7 +814,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
final VariantCallContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() );
return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > AVERAGE_HQ_SOFTCLIPS_HQ_BASES_THRESHOLD ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() );
}
//---------------------------------------------------------------------------------------------------------------
@ -1138,7 +1169,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private Set<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion ) {
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
if( rec.getReadLength() < READ_LENGTH_FILTER_THRESHOLD || rec.getMappingQuality() < READ_QUALITY_FILTER_THRESHOLD || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
readsToRemove.add(rec);
}
}

View File

@ -97,7 +97,26 @@ public class ReferenceConfidenceModel {
private final static boolean WRITE_DEBUGGING_BAM = false;
private final SAMFileWriter debuggingWriter;
private final static byte REF_MODEL_DELETION_QUAL = (byte) 30;
/**
* Surrogate quality score for no base calls.
* <p>
* This is the quality assigned to deletion (so without its own base-call quality) pile-up elements,
* when assessing the confidence on the hom-ref call at that site.
* </p>
*/
private final static byte REF_MODEL_DELETION_QUAL = 30;
/**
* Base calls with quality lower than this threshold won't be considered when assessing the
* confidence on the hom-ref call.
*/
private final static byte BASE_QUAL_THRESHOLD = 6;
/**
* Only base calls with quality strictly greater than this constant,
* will be considered high quality if they are part of a soft-clip.
*/
private final static byte HQ_BASE_QUALITY_SOFTCLIP_THRESHOLD = 28;
/**
* Create a new ReferenceConfidenceModel
@ -147,7 +166,6 @@ public class ReferenceConfidenceModel {
if ( debuggingWriter != null ) debuggingWriter.close();
}
/**
* Calculate the reference confidence for a single sample given its read data
*
@ -209,7 +227,7 @@ public class ReferenceConfidenceModel {
// Assume infinite population on a single sample.
final int refOffset = offset + globalRefOffset;
final byte refBase = ref[refOffset];
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, (byte)6, null);
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, BASE_QUAL_THRESHOLD, null);
homRefCalc.capByHomRefLikelihood();
final Allele refAllele = Allele.create(refBase, true);
@ -269,8 +287,27 @@ public class ReferenceConfidenceModel {
protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway
private static final int INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY = 20;
private static GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY + 1][];
/**
* Indel error rate for the indel model used to assess the confidence on the hom-ref call.
*/
private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
/**
* Phred scaled qual value that corresponds to the {@link #INDEL_ERROR_RATE indel error rate}.
*/
private static final byte INDEL_QUAL = (byte) Math.round((INDEL_ERROR_RATE * -10.0));
/**
* No indel likelihood (ref allele) used in the indel model to assess the confidence on the hom-ref call.
*/
private static final double NO_INDEL_LIKELIHOOD = QualityUtils.qualToProbLog10(INDEL_QUAL);
/**
* Indel likelihood (alt. allele) used in the indel model to assess the confidence on the hom-ref call.
*/
private static final double INDEL_LIKELIHOOD = QualityUtils.qualToErrorProbLog10(INDEL_QUAL);
/**
 * Looks up the cached hom-ref indel genotype likelihoods for the given sample ploidy
 * and number of indel-informative reads.
 *
 * @param ploidy the assumed sample ploidy; indexes the per-ploidy cache.
 * @param nInformativeReads number of indel-informative reads at the site; indexes into
 *        the cached array returned by {@code initializeIndelPLCache(ploidy)}.
 * @return the cached {@code GenotypeLikelihoods} entry for that read count.
 */
private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) {
return initializeIndelPLCache(ploidy)[nInformativeReads];
}
@ -287,14 +324,11 @@ public class ReferenceConfidenceModel {
final GenotypeLikelihoods[] result = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1];
result[0] = GenotypeLikelihoods.fromLog10Likelihoods(new double[ploidy + 1]);
for( int nInformativeReads = 1; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) {
final byte indelQual = (byte) Math.round((INDEL_ERROR_RATE * -10));
final double refLikelihood = QualityUtils.qualToProbLog10(indelQual);
final double altLikelihood = QualityUtils.qualToErrorProbLog10(indelQual);
double[] PLs = new double[ploidy + 1];
PLs[0] = nInformativeReads * refLikelihood;
PLs[0] = nInformativeReads * NO_INDEL_LIKELIHOOD;
for (int altCount = 1; altCount <= ploidy; altCount++) {
final double refLikelihoodAccum = refLikelihood + MathUtils.Log10Cache.get(ploidy - altCount);
final double altLikelihoodAccum = altLikelihood + MathUtils.Log10Cache.get(altCount);
final double refLikelihoodAccum = NO_INDEL_LIKELIHOOD + MathUtils.Log10Cache.get(ploidy - altCount);
final double altLikelihoodAccum = INDEL_LIKELIHOOD + MathUtils.Log10Cache.get(altCount);
PLs[altCount] = nInformativeReads * (MathUtils.approximateLog10SumLog10(refLikelihoodAccum ,altLikelihoodAccum) + denominator);
}
result[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(PLs);
@ -361,7 +395,7 @@ public class ReferenceConfidenceModel {
referenceLikelihood + MathUtils.Log10Cache.get(j),
nonRefLikelihood + MathUtils.Log10Cache.get(i));
if (isAlt && hqSoftClips != null && element.isNextToSoftClip())
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(element.getRead(), (byte) 28));
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(element.getRead(), HQ_BASE_QUALITY_SOFTCLIP_THRESHOLD));
}
/**