No dynamic programming solution for assigning genotypes; it is just done greedily now. Fixed QualByDepth to skip no-call genotypes. No-calls are no longer given annotations (attributes).
This commit is contained in:
parent
2fe50c64da
commit
aa4a8c5303
|
|
@ -38,7 +38,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
for ( final Genotype genotype : genotypes ) {
|
||||
|
||||
// we care only about variant calls with likelihoods
|
||||
if ( genotype.isHomRef() )
|
||||
if ( !genotype.isHet() && !genotype.isHomVar() )
|
||||
continue;
|
||||
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
|
|
|
|||
|
|
@ -42,7 +42,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||
|
||||
private static final boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
|
||||
private final boolean USE_MULTI_ALLELIC_CALCULATION;
|
||||
|
|
@ -592,10 +591,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
|
||||
ArrayList<String> sampleIndices = new ArrayList<String>();
|
||||
int sampleIdx = 0;
|
||||
|
||||
// todo - optimize initialization
|
||||
for (int k=0; k <= AFofMaxLikelihood; k++)
|
||||
|
|
@ -604,83 +601,29 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
pathMetricArray[0][0] = 0.0;
|
||||
|
||||
// todo = can't deal with optimal dynamic programming solution with multiallelic records
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
|
||||
sampleIdx = GLs.size();
|
||||
}
|
||||
else {
|
||||
|
||||
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( !genotype.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
double[] likelihoods = genotype.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
|
||||
//System.out.print(sample.getKey()+":");
|
||||
//for (int k=0; k < likelihoods.length; k++)
|
||||
// System.out.format("%4.2f ",likelihoods[k]);
|
||||
//System.out.println();
|
||||
// all likelihoods are essentially the same: skip this sample and will later on force no call.
|
||||
//sampleIdx++;
|
||||
continue;
|
||||
}
|
||||
|
||||
sampleIndices.add(genotype.getSampleName());
|
||||
|
||||
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
||||
|
||||
double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
|
||||
int bestIndex = k;
|
||||
|
||||
if (k>0) {
|
||||
double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1];
|
||||
if (m2 > bestMetric) {
|
||||
bestMetric = m2;
|
||||
bestIndex = k-1;
|
||||
}
|
||||
}
|
||||
|
||||
if (k>1) {
|
||||
double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2];
|
||||
if (m2 > bestMetric) {
|
||||
bestMetric = m2;
|
||||
bestIndex = k-2;
|
||||
}
|
||||
}
|
||||
|
||||
pathMetricArray[sampleIdx+1][k] = bestMetric;
|
||||
tracebackArray[sampleIdx+1][k] = bestIndex;
|
||||
}
|
||||
sampleIdx++;
|
||||
}
|
||||
}
|
||||
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
|
||||
|
||||
GenotypesContext calls = GenotypesContext.create();
|
||||
|
||||
int startIdx = AFofMaxLikelihood;
|
||||
for (int k = sampleIdx; k > 0; k--) {
|
||||
for (int k = GLs.size(); k > 0; k--) {
|
||||
int bestGTguess;
|
||||
String sample = sampleIndices.get(k-1);
|
||||
Genotype g = GLs.get(sample);
|
||||
if ( !g.hasLikelihoods() )
|
||||
continue;
|
||||
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
|
||||
// and will add no-call genotype to GL's in a second pass
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double[] likelihoods = g.getLikelihoods().getAsVector();
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
|
||||
}
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];;
|
||||
bestGTguess = startIdx - newIdx;
|
||||
startIdx = newIdx;
|
||||
// if there is no mass on the likelihoods, then just no-call the sample
|
||||
if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) {
|
||||
calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
|
||||
continue;
|
||||
}
|
||||
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
|
||||
|
||||
// likelihoods are stored row-wise in lower triangular matrix. IE
|
||||
// for 2 alleles they have ordering AA,AB,BB
|
||||
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
|
||||
|
|
@ -709,33 +652,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
|
||||
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods);
|
||||
//System.out.println(myAlleles.toString());
|
||||
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
|
||||
}
|
||||
|
||||
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( !genotype.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
final Genotype g = GLs.get(genotype.getSampleName());
|
||||
final double[] likelihoods = genotype.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
|
||||
continue; // regular likelihoods
|
||||
|
||||
final double qual = Genotype.NO_LOG10_PERROR;
|
||||
calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false));
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
|
||||
int j = logYMatrix.length - 1;
|
||||
System.out.printf("-----------------------------------%n");
|
||||
for (int k=0; k <= numChr; k++) {
|
||||
double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
|
||||
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("f6ef10dee80f9ccd7d245a28787ca887"));
|
||||
Arrays.asList("a2d3839c4ebb390b0012d495e4e53b3a"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testWithAllelesPassedIn2() {
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("d0593483e85a7d815f4c5ee6db284d2a"));
|
||||
Arrays.asList("43e7a17d95b1a0cf72e669657794d802"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("3ccce5d909f8f128e496f6841836e5f7"));
|
||||
Arrays.asList("ae29b9c9aacce8046dc780430540cd62"));
|
||||
executeTest("test SingleSample Pilot2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864";
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65";
|
||||
|
||||
@Test
|
||||
public void testCompressedOutput() {
|
||||
|
|
@ -83,7 +83,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
|
||||
|
||||
String md5 = "95614280c565ad90f8c000376fef822c";
|
||||
String md5 = "32a34362dff51d8b73a3335048516d82";
|
||||
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
|
||||
|
|
@ -164,8 +164,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHeterozyosity() {
|
||||
HashMap<Double, String> e = new HashMap<Double, String>();
|
||||
e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" );
|
||||
e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" );
|
||||
e.put( 0.01, "2cb2544739e01f6c08fd820112914317" );
|
||||
e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" );
|
||||
|
||||
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("6e182a58472ea17c8b0eb01f80562fbd"));
|
||||
Arrays.asList("45633d905136c86e9d3f90ce613255e5"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
|
||||
Arrays.asList("1d4a6a1b840ca6a130516ab9f2d99869"));
|
||||
Arrays.asList("75e49dff01763aff2984dc86a72eb229"));
|
||||
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
|
||||
}
|
||||
|
||||
|
|
@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
||||
Arrays.asList("6ee2f3c6b5422f0a2ad0669639e293cb"));
|
||||
Arrays.asList("8209a308d95659c6da7dab8733c736f9"));
|
||||
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue