Feature requested by Reich lab and Paavo lab in Leipzig for ancient DNA processing:
-- When doing cross-species comparisons and studying population history and ancient DNA data, having SOME measure of confidence is needed at every single site that doesn't depend on the reference base, even in a naive per-site SNP mode. Old versions of GATK provided GQ and some wrong PL values at reference sites but these were wrong. This commit addresses this need by adding a new UG command line argument, -allSitePLs, that, if enabled will: a) Emit all 3 ALT snp alleles in the ALT column. b) Emit all corresponding 10 PL values. It's up to the user to process these PL values downstream to make sense of these. Note that, in order to follow VCF spec, the QUAL field in a reference call when there are non-null ALT alleles present will be zero, so QUAL will be useless and filtering will need to be done based on other fields. -- Tweaks and fixes to processing pipelines for Reich lab.
This commit is contained in:
parent
fce448cc9e
commit
f6025d25ae
|
|
@ -147,9 +147,17 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
// if we only want variants, then we don't need to calculate genotype likelihoods
|
||||
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
|
||||
return builder.make();
|
||||
// if user requires all PLs at all sites, add all possible alt alleles
|
||||
else if (UAC.annotateAllSitesWithPLs) {
|
||||
for ( final byte base : BaseUtils.BASES ) {
|
||||
if ( base != refBase )
|
||||
alleles.add(Allele.create(base));
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
|
||||
else
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,9 @@ import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
|||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
|
||||
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
|
||||
|
|
@ -95,6 +98,18 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
/**
|
||||
* Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
|
||||
* This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
|
||||
* WARNINGS:
|
||||
* - This feature will inflate VCF file size considerably.
|
||||
* - All SNP ALT alleles will be emitted with corresponding 10 PL values.
|
||||
* - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
|
||||
public boolean annotateAllSitesWithPLs = false;
|
||||
|
||||
// indel-related arguments
|
||||
/**
|
||||
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
|
||||
|
|
@ -247,7 +262,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
|
||||
this.pairHMM = uac.pairHMM;
|
||||
|
||||
this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs;
|
||||
// todo- arguments to remove
|
||||
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -168,6 +168,13 @@ public class UnifiedGenotyperEngine {
|
|||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
determineGLModelsToUse();
|
||||
|
||||
// do argument checking
|
||||
if (UAC.annotateAllSitesWithPLs) {
|
||||
if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP))
|
||||
throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -439,7 +446,8 @@ public class UnifiedGenotyperEngine {
|
|||
bestGuessIsRef = false;
|
||||
}
|
||||
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
|
||||
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ||
|
||||
UAC.annotateAllSitesWithPLs) {
|
||||
myAlleles.add(alternateAllele);
|
||||
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
|
||||
}
|
||||
|
|
@ -449,7 +457,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
|
||||
final double phredScaledConfidence =
|
||||
Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs
|
||||
? -10 * AFresult.getLog10PosteriorOfAFEq0()
|
||||
: -10 * AFresult.getLog10PosteriorOfAFGT0());
|
||||
|
||||
|
|
|
|||
|
|
@ -156,6 +156,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emitPLsAtAllSites() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1,
|
||||
Arrays.asList("7cc55db8693759e059a05bc4398f6f69"));
|
||||
executeTest("test all site PLs 1", spec1);
|
||||
|
||||
}
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing heterozygosity
|
||||
|
|
|
|||
|
|
@ -52,6 +52,5 @@ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaC
|
|||
override def commandLine = super.commandLine +
|
||||
required("SUMMARY_OUTPUT=" + output) +
|
||||
required("CHART_OUTPUT=" + output+".pdf") +
|
||||
required("REFERENCE_SEQUENCE=" + reference) +
|
||||
required("ASSUME_SORTED=true")
|
||||
required("REFERENCE_SEQUENCE=" + reference)
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue