Fix all-bases mode and genotype mode in the UnifiedGenotyper (UG) and add integration tests for them.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2295 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4e54b91ce4
commit
40c2d7a4bc
|
|
@ -44,9 +44,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
||||||
}
|
}
|
||||||
|
|
||||||
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
|
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
|
||||||
if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
|
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
|
||||||
return new Pair<VariationCall, List<Genotype>>(null, null);
|
return new Pair<VariationCall, List<Genotype>>(null, null);
|
||||||
}
|
|
||||||
|
|
||||||
// generate the calls
|
// generate the calls
|
||||||
List<Genotype> calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts);
|
List<Genotype> calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts);
|
||||||
|
|
|
||||||
|
|
@ -51,22 +51,18 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
// if there are no non-ref bases...
|
// if there are no non-ref bases...
|
||||||
if ( bestAlternateAllele == null ) {
|
if ( bestAlternateAllele == null ) {
|
||||||
// if we don't want all bases, then we can just return
|
// if we don't want all bases, then we can just return
|
||||||
if ( !ALL_BASE_MODE )
|
if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
|
||||||
return new Pair<VariationCall, List<Genotype>>(null, null);
|
return new Pair<VariationCall, List<Genotype>>(null, null);
|
||||||
|
|
||||||
// otherwise, we care about the ref base
|
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||||
bestAlternateAllele = ref;
|
bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
|
||||||
|
|
||||||
// TODO -- figure out what to do here!
|
|
||||||
|
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
initializeAlleleFrequencies(frequencyEstimationPoints);
|
|
||||||
|
|
||||||
initialize(ref, contexts, StratifiedContext.OVERALL);
|
initializeAlleleFrequencies(frequencyEstimationPoints);
|
||||||
calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedContext.OVERALL);
|
|
||||||
calculatePofFs(ref, frequencyEstimationPoints);
|
initialize(ref, contexts, StratifiedContext.OVERALL);
|
||||||
}
|
calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedContext.OVERALL);
|
||||||
|
calculatePofFs(ref, frequencyEstimationPoints);
|
||||||
|
|
||||||
// print out stats if we have a writer
|
// print out stats if we have a writer
|
||||||
if ( verboseWriter != null )
|
if ( verboseWriter != null )
|
||||||
|
|
@ -311,19 +307,30 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
protected List<Genotype> makeGenotypeCalls(char ref, char alt, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
protected List<Genotype> makeGenotypeCalls(char ref, char alt, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
||||||
// by default, we return no genotypes
|
// by default, we return no genotypes
|
||||||
return new ArrayList<Genotype>();
|
return new ArrayList<Genotype>();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Pair<VariationCall, List<Genotype>> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
|
protected Pair<VariationCall, List<Genotype>> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
|
||||||
// only need to look at the most likely alternate allele
|
// only need to look at the most likely alternate allele
|
||||||
int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
|
int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
|
||||||
|
|
||||||
double phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
|
|
||||||
if ( Double.isInfinite(phredScaledConfidence) )
|
|
||||||
phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
|
|
||||||
int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]);
|
int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]);
|
||||||
|
double phredScaledConfidence;
|
||||||
|
if ( bestAFguess != 0 ) {
|
||||||
|
phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
|
||||||
|
if ( Double.isInfinite(phredScaledConfidence) )
|
||||||
|
phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
|
||||||
|
} else {
|
||||||
|
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofFs[indexOfMax]);
|
||||||
|
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||||
|
double sum = 0.0;
|
||||||
|
for (int i = 1; i < frequencyEstimationPoints; i++)
|
||||||
|
sum += log10PofDgivenAFi[indexOfMax][i];
|
||||||
|
phredScaledConfidence = -10.0 * sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
|
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
|
||||||
if ( !ALL_BASE_MODE && (bestAFguess == 0 || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
|
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestAFguess == 0) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
|
||||||
return new Pair<VariationCall, List<Genotype>>(null, null);
|
return new Pair<VariationCall, List<Genotype>>(null, null);
|
||||||
|
|
||||||
// populate the sample-specific data
|
// populate the sample-specific data
|
||||||
|
|
@ -333,8 +340,10 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||||
VariationCall locusdata = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, loc, bestAFguess == 0 ? VARIANT_TYPE.REFERENCE : VARIANT_TYPE.SNP);
|
VariationCall locusdata = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, loc, bestAFguess == 0 ? VARIANT_TYPE.REFERENCE : VARIANT_TYPE.SNP);
|
||||||
if ( locusdata != null ) {
|
if ( locusdata != null ) {
|
||||||
locusdata.addAlternateAllele(bestAlternateAllele.toString());
|
if ( bestAFguess != 0 ) {
|
||||||
locusdata.setNonRefAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
|
locusdata.addAlternateAllele(bestAlternateAllele.toString());
|
||||||
|
locusdata.setNonRefAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
|
||||||
|
}
|
||||||
if ( locusdata instanceof ConfidenceBacked ) {
|
if ( locusdata instanceof ConfidenceBacked ) {
|
||||||
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
|
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -67,9 +67,8 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
|
||||||
}
|
}
|
||||||
|
|
||||||
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
|
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
|
||||||
if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
|
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
|
||||||
return new Pair<VariationCall, List<Genotype>>(null, null);
|
return new Pair<VariationCall, List<Genotype>>(null, null);
|
||||||
}
|
|
||||||
|
|
||||||
// we can now create the genotype call object
|
// we can now create the genotype call object
|
||||||
GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, context.getLocation());
|
GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, context.getLocation());
|
||||||
|
|
|
||||||
|
|
@ -48,10 +48,10 @@ public class UnifiedArgumentCollection {
|
||||||
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "Format to be used to represent variants; default is VCF", required = false)
|
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "Format to be used to represent variants; default is VCF", required = false)
|
||||||
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||||
|
|
||||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
|
||||||
public boolean GENOTYPE = false;
|
public boolean GENOTYPE = false;
|
||||||
|
|
||||||
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
|
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
|
||||||
public boolean ALL_BASES = false;
|
public boolean ALL_BASES = false;
|
||||||
|
|
||||||
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||||
|
|
|
||||||
|
|
@ -101,6 +101,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGenotypeModeJoint() {
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-T UnifiedGenotyper -genotype -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1,
|
||||||
|
Arrays.asList("6971e0bfa524c0e006b3c3ccef52520a"));
|
||||||
|
executeTest("testGenotypeMode - Joint Estimate", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAllBasesModeJoint() {
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-T UnifiedGenotyper -all_bases -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1,
|
||||||
|
Arrays.asList("9f54482c1594bdd1e28b4cf2e51f944f"));
|
||||||
|
executeTest("testAllBasesMode - Joint Estimate", spec);
|
||||||
|
}
|
||||||
|
|
||||||
//@Test
|
//@Test
|
||||||
//public void testGLF() {
|
//public void testGLF() {
|
||||||
// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue