diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index d555463bc..6638fc7a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -38,7 +38,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati for ( final Genotype genotype : genotypes ) { // we care only about variant calls with likelihoods - if ( genotype.isHomRef() ) + if ( !genotype.isHet() && !genotype.isHomVar() ) continue; AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index f9d30e0d1..22017a1ee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -42,7 +42,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - private static final boolean SIMPLE_GREEDY_GENOTYPER = false; private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); private final boolean USE_MULTI_ALLELIC_CALCULATION; @@ -592,10 +591,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { GenotypesContext GLs = vc.getGenotypes(); double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; - int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1]; ArrayList sampleIndices = new ArrayList(); - int sampleIdx = 0; // todo - optimize initialization for (int k=0; k <= AFofMaxLikelihood; k++) @@ -604,83 +601,29 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { pathMetricArray[0][0] = 0.0; - // todo = can't deal with optimal dynamic programming solution with multiallelic records - if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - sampleIndices.addAll(GLs.getSampleNamesOrderedByName()); - sampleIdx = GLs.size(); - } - else { - - for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) { - if ( !genotype.hasLikelihoods() ) - continue; - - double[] likelihoods = genotype.getLikelihoods().getAsVector(); - - if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) { - //System.out.print(sample.getKey()+":"); - //for (int k=0; k < likelihoods.length; k++) - // System.out.format("%4.2f ",likelihoods[k]); - //System.out.println(); - // all likelihoods are essentially the same: skip this sample and will later on force no call. - //sampleIdx++; - continue; - } - - sampleIndices.add(genotype.getSampleName()); - - for (int k=0; k <= AFofMaxLikelihood; k++) { - - double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0]; - int bestIndex = k; - - if (k>0) { - double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1]; - if (m2 > bestMetric) { - bestMetric = m2; - bestIndex = k-1; - } - } - - if (k>1) { - double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2]; - if (m2 > bestMetric) { - bestMetric = m2; - bestIndex = k-2; - } - } - - pathMetricArray[sampleIdx+1][k] = bestMetric; - tracebackArray[sampleIdx+1][k] = bestIndex; - } - sampleIdx++; - } - } + sampleIndices.addAll(GLs.getSampleNamesOrderedByName()); GenotypesContext calls = GenotypesContext.create(); - int startIdx = AFofMaxLikelihood; - for (int k = sampleIdx; k > 0; k--) { + for (int k = GLs.size(); k > 0; k--) { int bestGTguess; String sample = sampleIndices.get(k-1); Genotype g = GLs.get(sample); if ( !g.hasLikelihoods() ) continue; - // if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now, - // and will add no-call genotype to GL's in a second pass + ArrayList myAlleles = new ArrayList(); double[] likelihoods = g.getLikelihoods().getAsVector(); - if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - bestGTguess = Utils.findIndexOfMaxEntry(likelihoods); - } - else { - int newIdx = tracebackArray[k][startIdx];; - bestGTguess = startIdx - newIdx; - startIdx = newIdx; + // if there is no mass on the likelihoods, then just no-call the sample + if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) { + calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + continue; } + bestGTguess = Utils.findIndexOfMaxEntry(likelihoods); + // likelihoods are stored row-wise in lower triangular matrix. IE // for 2 alleles they have ordering AA,AB,BB // for 3 alleles they are ordered AA,AB,BB,AC,BC,CC @@ -709,33 +652,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods); - //System.out.println(myAlleles.toString()); calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); } - for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) { - if ( !genotype.hasLikelihoods() ) - continue; - - final Genotype g = GLs.get(genotype.getSampleName()); - final double[] likelihoods = genotype.getLikelihoods().getAsVector(); - - if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL) - continue; // regular likelihoods - - final double qual = Genotype.NO_LOG10_PERROR; - calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false)); - } - return calls; } - - private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) { - int j = logYMatrix.length - 1; - System.out.printf("-----------------------------------%n"); - for (int k=0; k <= numChr; k++) { - double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior); - } - } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a91b6c15d..c04b0085c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("f6ef10dee80f9ccd7d245a28787ca887")); + Arrays.asList("a2d3839c4ebb390b0012d495e4e53b3a")); executeTest("test MultiSample Pilot1", spec); } @@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d0593483e85a7d815f4c5ee6db284d2a")); + Arrays.asList("43e7a17d95b1a0cf72e669657794d802")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("3ccce5d909f8f128e496f6841836e5f7")); + Arrays.asList("ae29b9c9aacce8046dc780430540cd62")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864"; + private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65"; @Test public void testCompressedOutput() { @@ -83,7 +83,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "95614280c565ad90f8c000376fef822c"; + String md5 = "32a34362dff51d8b73a3335048516d82"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -164,8 +164,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" ); - e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" ); + e.put( 0.01, "2cb2544739e01f6c08fd820112914317" ); + e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("6e182a58472ea17c8b0eb01f80562fbd")); + Arrays.asList("45633d905136c86e9d3f90ce613255e5")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("1d4a6a1b840ca6a130516ab9f2d99869")); + Arrays.asList("75e49dff01763aff2984dc86a72eb229")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("6ee2f3c6b5422f0a2ad0669639e293cb")); + Arrays.asList("8209a308d95659c6da7dab8733c736f9")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); }