Fix all-bases-mode and genotype-mode in the UG and add integration tests for them.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2295 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-09 17:41:30 +00:00
parent 4e54b91ce4
commit 40c2d7a4bc
5 changed files with 50 additions and 27 deletions

View File

@ -44,9 +44,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
}
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
return new Pair<VariationCall, List<Genotype>>(null, null);
}
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
return new Pair<VariationCall, List<Genotype>>(null, null);
// generate the calls
List<Genotype> calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts);

View File

@ -51,22 +51,18 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
// if there are no non-ref bases...
if ( bestAlternateAllele == null ) {
// if we don't want all bases, then we can just return
if ( !ALL_BASE_MODE )
if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
return new Pair<VariationCall, List<Genotype>>(null, null);
// otherwise, we care about the ref base
bestAlternateAllele = ref;
// TODO -- figure out what to do here!
// otherwise, choose any alternate allele (it doesn't really matter)
bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
}
else {
initializeAlleleFrequencies(frequencyEstimationPoints);
initialize(ref, contexts, StratifiedContext.OVERALL);
calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedContext.OVERALL);
calculatePofFs(ref, frequencyEstimationPoints);
}
initializeAlleleFrequencies(frequencyEstimationPoints);
initialize(ref, contexts, StratifiedContext.OVERALL);
calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedContext.OVERALL);
calculatePofFs(ref, frequencyEstimationPoints);
// print out stats if we have a writer
if ( verboseWriter != null )
@ -311,19 +307,30 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
protected List<Genotype> makeGenotypeCalls(char ref, char alt, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
// by default, we return no genotypes
return new ArrayList<Genotype>();
}
}
protected Pair<VariationCall, List<Genotype>> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
// only need to look at the most likely alternate allele
int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
double phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]);
double phredScaledConfidence;
if ( bestAFguess != 0 ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofFs[indexOfMax]);
if ( Double.isInfinite(phredScaledConfidence) ) {
double sum = 0.0;
for (int i = 1; i < frequencyEstimationPoints; i++)
sum += log10PofDgivenAFi[indexOfMax][i];
phredScaledConfidence = -10.0 * sum;
}
}
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( !ALL_BASE_MODE && (bestAFguess == 0 || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestAFguess == 0) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
return new Pair<VariationCall, List<Genotype>>(null, null);
// populate the sample-specific data
@ -333,8 +340,10 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
// *** note that calculating strand bias involves overwriting data structures, so we do that last
VariationCall locusdata = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, loc, bestAFguess == 0 ? VARIANT_TYPE.REFERENCE : VARIANT_TYPE.SNP);
if ( locusdata != null ) {
locusdata.addAlternateAllele(bestAlternateAllele.toString());
locusdata.setNonRefAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
if ( bestAFguess != 0 ) {
locusdata.addAlternateAllele(bestAlternateAllele.toString());
locusdata.setNonRefAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
}
if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
}

View File

@ -67,9 +67,8 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
}
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
return new Pair<VariationCall, List<Genotype>>(null, null);
}
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
return new Pair<VariationCall, List<Genotype>>(null, null);
// we can now create the genotype call object
GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, context.getLocation());

View File

@ -48,10 +48,10 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "Format to be used to represent variants; default is VCF", required = false)
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
public boolean GENOTYPE = false;
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
public boolean ALL_BASES = false;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)

View File

@ -101,6 +101,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
}
@Test
public void testGenotypeModeJoint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -genotype -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1,
Arrays.asList("6971e0bfa524c0e006b3c3ccef52520a"));
executeTest("testGenotypeMode - Joint Estimate", spec);
}
@Test
public void testAllBasesModeJoint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -all_bases -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1,
Arrays.asList("9f54482c1594bdd1e28b4cf2e51f944f"));
executeTest("testAllBasesMode - Joint Estimate", spec);
}
//@Test
//public void testGLF() {
// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(