Last feature request from Reich/Paavo labs: the allSitePLs feature in UG worked but not quite filled requirements. What's needed is the ability to have all 10 PLs for EVERY site, regardless of whether they are variant or not. Previous version only emitted the 10 PLs in reference sites. Problem is that, if all PLs are emitted in all sites and every single site is quad-allelic (only way to have the PLs printed out in a valid way) then the ability to filter variants and to use the INFO fields may be compromised.

So, compromise solution is to go back to having biallelic PLs but emit a new FORMAT field, called APL, which has the 10 values, but all other statistics and regular PLs are computed as before.
Note that integration test had to be disabled, as the BCF2 codec apparently doesn't support writing into genotype fields other than PL,DP,AD,GQ,FT and GT.
This commit is contained in:
Guillermo del Angel 2013-07-16 11:14:48 -04:00
parent 234f564009
commit 9dd109b79a
4 changed files with 10 additions and 8 deletions

View File

@ -147,13 +147,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// if we only want variants, then we don't need to calculate genotype likelihoods
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
return builder.make();
// if user requires all PLs at all sites, add all possible alt alleles
else if (UAC.annotateAllSitesWithPLs) {
for ( final byte base : BaseUtils.BASES ) {
if ( base != refBase )
alleles.add(Allele.create(base));
}
}
else
// otherwise, choose any alternate allele (it doesn't really matter)
@ -199,6 +192,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(myLikelihoods, false, true);
gb.PL(genotypeLikelihoods);
gb.DP(sampleData.depth);
if (UAC.annotateAllSitesWithPLs)
gb.attribute(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true)));
genotypes.add(gb.make());
}

View File

@ -318,6 +318,9 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth"));
}
if (UAC.annotateAllSitesWithPLs) {
headerInfo.add(new VCFFormatHeaderLine(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. Ordering is always PL for AA AC CC GA GC GG TA TC TG TT."));
}
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
VCFConstants.DOWNSAMPLED_KEY,
VCFConstants.MLE_ALLELE_COUNT_KEY,

View File

@ -79,6 +79,7 @@ public class UnifiedGenotyperEngine {
private static final String GPSTRING = "GENERALPLOIDY";
public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA";
public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;

View File

@ -164,7 +164,10 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void emitPLsAtAllSites() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1,
Arrays.asList("7cc55db8693759e059a05bc4398f6f69"));
Arrays.asList("552aced1b1ef7e4a554223f4719f9560"));
// GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail
spec1.disableShadowBCF();
executeTest("test all site PLs 1", spec1);
}