- Removed max_coverage argument from UG; Aaron will set it up so that we don't call when the GATK had to drop reads.

- Reimplemented optimization in UG to not call when there are no non-ref bases.
- Compute reference confidence accurately in UG for ref calls.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2693 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-01-26 21:56:33 +00:00
parent 2c8d7b0c44
commit 47440bc029
4 changed files with 58 additions and 31 deletions

View File

@ -48,17 +48,17 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
// if there are no non-ref bases...
if ( bestAlternateAllele == null ) {
// if we don't want all bases, then we can just return
// todo -- we still need to calculate the confidence in the reference base.
// todo -- we can still include this optimization, but we should calculate a confidence score
// if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
// return new VariantCallContext(false);
// if we don't want all bases, then we don't need to calculate genotype likelihoods
if ( !ALL_BASE_MODE && !GENOTYPE_MODE ) {
VariantCallContext vcc = new VariantCallContext(false);
estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false);
return vcc;
}
// otherwise, choose any alternate allele (it doesn't really matter)
bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
}
// calculate likelihoods if there are non-ref bases
initializeAlleleFrequencies(frequencyEstimationPoints);
initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
@ -69,8 +69,14 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
if ( verboseWriter != null )
printAlleleFrequencyData(ref, loc, frequencyEstimationPoints);
return createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);
}
VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);
// technically, at this point our confidence in a reference call isn't accurately
// estimated because it didn't take into account samples with no data
if ( vcc.variation == null )
estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, true);
return vcc;
}
protected int getNSamples(Map<String, StratifiedAlignmentContext> contexts) {
return contexts.size();
@ -151,6 +157,27 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
return AFs;
}
private void estimateReferenceConfidence(VariantCallContext vcc, Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {
double P_of_ref = 1.0;
// use the AF=0 prob if it's calculated
if ( ignoreCoveredSamples )
P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];
// for each sample that we haven't examined yet
for ( String sample : samples ) {
boolean isCovered = contexts.containsKey(sample);
if ( ignoreCoveredSamples && isCovered )
continue;
int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
}
vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= CONFIDENCE_THRESHOLD;
}
protected void calculateAlleleFrequencyPosteriors(char ref, int frequencyEstimationPoints, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
// initialization

View File

@ -83,7 +83,4 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
@Argument(fullName = "max_coverage", shortName = "mc", doc = "Maximum reads at this locus (after filtering bad bases/reads) for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
public Integer MAX_READS_IN_PILEUP = 100000;
}

View File

@ -260,15 +260,19 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
if ( !BaseUtils.isRegularBase(ref) )
return null;
ReadBackedPileup rawPileup = rawContext.getBasePileup();
// don't try to call if we couldn't read in all reads at this locus (since it wasn't properly downsampled)
if ( rawPileup.size() == getToolkit().getArguments().readMaxPileup )
return null;
// filter the context based on min base and mapping qualities
ReadBackedPileup pileup = rawContext.getBasePileup().getBaseFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE);
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the context based on mapping quality and mismatch rate
pileup = filterPileup(pileup, refContext, UAC);
// an optimization to speed things up when there is no coverage or when overly covered
if ( pileup.size() == 0 ||
(UAC.MAX_READS_IN_PILEUP > 0 && pileup.size() > UAC.MAX_READS_IN_PILEUP) )
// don't call when there is no coverage
if ( pileup.size() == 0 )
return null;
// are there too many deletions in the pileup?
@ -306,8 +310,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
for ( PileupElement p : pileup ) {
if ( p.getMappingQual() >= UAC.MIN_MAPPING_QUALTY_SCORE &&
(UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
if ( (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES )
filteredPileup.add(p);
}

View File

@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1PointEM() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
Arrays.asList("46232790dae2e99e79626de78836ba08"));
Arrays.asList("5ad3f97c886a3381821366caa9162c12"));
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
}
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2PointEM() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
Arrays.asList("683cfa48cc5e2d140286645873184c20"));
Arrays.asList("73c80566c8353958b2ac61932f0b3812"));
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
}
@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testPooled1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
Arrays.asList("f2b3799fe18010aa8cfd76b0e0782db7"));
Arrays.asList("5a86c53e8f0897a71ff74662c5696dc4"));
executeTest("testPooled1", spec);
}
@ -56,7 +56,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("42fc8d585f2f30dc6c58d413738e1f7c"));
Arrays.asList("5e0a92fbddeb9d6e35586d0488a1e5c7"));
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
}
@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("c854552bbf4a33b8dca488ea5f0a4a32"));
Arrays.asList("68d00450e3c2129ea38c67171722b385"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
}
@ -72,7 +72,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2Joint() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
Arrays.asList("9dcb1d4af7c02804150a0f6c38be4e1e"));
Arrays.asList("8971ab1c9d2780e5e12e9bfc0b059cd1"));
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
}
@ -85,7 +85,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testParallelization() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1,
Arrays.asList("35637b1ec52f2a7c0551b87795c3060c"));
Arrays.asList("fb5d09eb8f1494d48032be7272699add"));
executeTest("test parallelization", spec);
}
@ -98,11 +98,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-genotype", "e5d4287d27aa7734f0d57a8213f549ef" );
e.put( "-all_bases", "3a291bb06764e3615230f467cc501096" );
e.put( "--min_base_quality_score 26", "fb00499f249973c156ca64e30e7b2d91" );
e.put( "--min_mapping_quality_score 26", "315c7951a783655445933ed9d83db2c2" );
e.put( "--max_mismatches_in_40bp_window 5", "bca61f8afce9f64763a1225e9c614d38" );
e.put( "-genotype", "41af43f6eaab72de553d865a3089bf54" );
e.put( "-all_bases", "7cc1609aef6d6cc3dd7822c52c403750" );
e.put( "--min_base_quality_score 26", "9596e2102369ced181f2a87d686faf2e" );
e.put( "--min_mapping_quality_score 26", "130efb2b8bd7b495bf65c6477bcf83c8" );
e.put( "--max_mismatches_in_40bp_window 5", "18935308954cf390b628c9226eccbe94" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -116,7 +116,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
Arrays.asList("c7427818f57cc3bf11e9ee98461c1a65"));
Arrays.asList("8c0e1fed37a2eac9eaaaa59e31350f43"));
executeTest("testConfidence", spec);
}