No dynamic programming solution for assignning genotypes; just done greedily now. Fixed QualByDepth to skip no-call genotypes. No-calls are no longer given annotations (attributes).

This commit is contained in:
Eric Banks 2011-12-09 02:25:06 -05:00
parent 2fe50c64da
commit aa4a8c5303
3 changed files with 20 additions and 101 deletions

View File

@ -38,7 +38,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
for ( final Genotype genotype : genotypes ) {
// we care only about variant calls with likelihoods
if ( genotype.isHomRef() )
if ( !genotype.isHet() && !genotype.isHomVar() )
continue;
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());

View File

@ -42,7 +42,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
private static final boolean SIMPLE_GREEDY_GENOTYPER = false;
private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
private final boolean USE_MULTI_ALLELIC_CALCULATION;
@ -592,10 +591,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
GenotypesContext GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
ArrayList<String> sampleIndices = new ArrayList<String>();
int sampleIdx = 0;
// todo - optimize initialization
for (int k=0; k <= AFofMaxLikelihood; k++)
@ -604,83 +601,29 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
pathMetricArray[0][0] = 0.0;
// todo = can't deal with optimal dynamic programming solution with multiallelic records
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
sampleIdx = GLs.size();
}
else {
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
if ( !genotype.hasLikelihoods() )
continue;
double[] likelihoods = genotype.getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
//System.out.print(sample.getKey()+":");
//for (int k=0; k < likelihoods.length; k++)
// System.out.format("%4.2f ",likelihoods[k]);
//System.out.println();
// all likelihoods are essentially the same: skip this sample and will later on force no call.
//sampleIdx++;
continue;
}
sampleIndices.add(genotype.getSampleName());
for (int k=0; k <= AFofMaxLikelihood; k++) {
double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
int bestIndex = k;
if (k>0) {
double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1];
if (m2 > bestMetric) {
bestMetric = m2;
bestIndex = k-1;
}
}
if (k>1) {
double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2];
if (m2 > bestMetric) {
bestMetric = m2;
bestIndex = k-2;
}
}
pathMetricArray[sampleIdx+1][k] = bestMetric;
tracebackArray[sampleIdx+1][k] = bestIndex;
}
sampleIdx++;
}
}
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
GenotypesContext calls = GenotypesContext.create();
int startIdx = AFofMaxLikelihood;
for (int k = sampleIdx; k > 0; k--) {
for (int k = GLs.size(); k > 0; k--) {
int bestGTguess;
String sample = sampleIndices.get(k-1);
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
// and will add no-call genotype to GL's in a second pass
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double[] likelihoods = g.getLikelihoods().getAsVector();
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
}
else {
int newIdx = tracebackArray[k][startIdx];;
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
// if there is no mass on the likelihoods, then just no-call the sample
if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) {
calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
continue;
}
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
// likelihoods are stored row-wise in lower triangular matrix. IE
// for 2 alleles they have ordering AA,AB,BB
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
@ -709,33 +652,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods);
//System.out.println(myAlleles.toString());
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
}
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
if ( !genotype.hasLikelihoods() )
continue;
final Genotype g = GLs.get(genotype.getSampleName());
final double[] likelihoods = genotype.getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
continue; // regular likelihoods
final double qual = Genotype.NO_LOG10_PERROR;
calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false));
}
return calls;
}
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
int j = logYMatrix.length - 1;
System.out.printf("-----------------------------------%n");
for (int k=0; k <= numChr; k++) {
double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
}
}
}

View File

@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("f6ef10dee80f9ccd7d245a28787ca887"));
Arrays.asList("a2d3839c4ebb390b0012d495e4e53b3a"));
executeTest("test MultiSample Pilot1", spec);
}
@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("d0593483e85a7d815f4c5ee6db284d2a"));
Arrays.asList("43e7a17d95b1a0cf72e669657794d802"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("3ccce5d909f8f128e496f6841836e5f7"));
Arrays.asList("ae29b9c9aacce8046dc780430540cd62"));
executeTest("test SingleSample Pilot2", spec);
}
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864";
private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65";
@Test
public void testCompressedOutput() {
@ -83,7 +83,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "95614280c565ad90f8c000376fef822c";
String md5 = "32a34362dff51d8b73a3335048516d82";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -164,8 +164,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" );
e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" );
e.put( 0.01, "2cb2544739e01f6c08fd820112914317" );
e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("6e182a58472ea17c8b0eb01f80562fbd"));
Arrays.asList("45633d905136c86e9d3f90ce613255e5"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}
@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
Arrays.asList("1d4a6a1b840ca6a130516ab9f2d99869"));
Arrays.asList("75e49dff01763aff2984dc86a72eb229"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
}
@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("6ee2f3c6b5422f0a2ad0669639e293cb"));
Arrays.asList("8209a308d95659c6da7dab8733c736f9"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
}