- Removed max_coverage argument from UG; Aaron will set it up so that we don't call when the GATK had to drop reads.
- Reimplemented optimization in UG to not call when there are no non-ref bases. - Compute reference confidence accurately in UG for ref calls. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2693 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2c8d7b0c44
commit
47440bc029
|
|
@ -48,17 +48,17 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
|||
|
||||
// if there are no non-ref bases...
|
||||
if ( bestAlternateAllele == null ) {
|
||||
// if we don't want all bases, then we can just return
|
||||
|
||||
// todo -- we still need to calculate the confidence in the reference base.
|
||||
// todo -- we can still include this optimization, but we should calculate a confidence score
|
||||
// if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
|
||||
// return new VariantCallContext(false);
|
||||
|
||||
// if we don't want all bases, then we don't need to calculate genotype likelihoods
|
||||
if ( !ALL_BASE_MODE && !GENOTYPE_MODE ) {
|
||||
VariantCallContext vcc = new VariantCallContext(false);
|
||||
estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false);
|
||||
return vcc;
|
||||
}
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
|
||||
}
|
||||
|
||||
// calculate likelihoods if there are non-ref bases
|
||||
initializeAlleleFrequencies(frequencyEstimationPoints);
|
||||
|
||||
initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
|
|
@ -69,8 +69,14 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
|||
if ( verboseWriter != null )
|
||||
printAlleleFrequencyData(ref, loc, frequencyEstimationPoints);
|
||||
|
||||
return createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);
|
||||
}
|
||||
VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);
|
||||
|
||||
// technically, at this point our confidence in a reference call isn't accurately
|
||||
// estimated because it didn't take into account samples with no data
|
||||
if ( vcc.variation == null )
|
||||
estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, true);
|
||||
return vcc;
|
||||
}
|
||||
|
||||
protected int getNSamples(Map<String, StratifiedAlignmentContext> contexts) {
|
||||
return contexts.size();
|
||||
|
|
@ -151,6 +157,27 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
|||
return AFs;
|
||||
}
|
||||
|
||||
private void estimateReferenceConfidence(VariantCallContext vcc, Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {
|
||||
|
||||
double P_of_ref = 1.0;
|
||||
|
||||
// use the AF=0 prob if it's calculated
|
||||
if ( ignoreCoveredSamples )
|
||||
P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];
|
||||
|
||||
// for each sample that we haven't examined yet
|
||||
for ( String sample : samples ) {
|
||||
boolean isCovered = contexts.containsKey(sample);
|
||||
if ( ignoreCoveredSamples && isCovered )
|
||||
continue;
|
||||
|
||||
int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
|
||||
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
|
||||
}
|
||||
|
||||
vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= CONFIDENCE_THRESHOLD;
|
||||
}
|
||||
|
||||
protected void calculateAlleleFrequencyPosteriors(char ref, int frequencyEstimationPoints, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
|
||||
|
||||
// initialization
|
||||
|
|
|
|||
|
|
@ -83,7 +83,4 @@ public class UnifiedArgumentCollection {
|
|||
|
||||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
@Argument(fullName = "max_coverage", shortName = "mc", doc = "Maximum reads at this locus (after filtering bad bases/reads) for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
|
||||
public Integer MAX_READS_IN_PILEUP = 100000;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -260,15 +260,19 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
return null;
|
||||
|
||||
ReadBackedPileup rawPileup = rawContext.getBasePileup();
|
||||
// don't try to call if we couldn't read in all reads at this locus (since it wasn't properly downsampled)
|
||||
if ( rawPileup.size() == getToolkit().getArguments().readMaxPileup )
|
||||
return null;
|
||||
|
||||
// filter the context based on min base and mapping qualities
|
||||
ReadBackedPileup pileup = rawContext.getBasePileup().getBaseFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE);
|
||||
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// filter the context based on mapping quality and mismatch rate
|
||||
pileup = filterPileup(pileup, refContext, UAC);
|
||||
|
||||
// an optimization to speed things up when there is no coverage or when overly covered
|
||||
if ( pileup.size() == 0 ||
|
||||
(UAC.MAX_READS_IN_PILEUP > 0 && pileup.size() > UAC.MAX_READS_IN_PILEUP) )
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 )
|
||||
return null;
|
||||
|
||||
// are there too many deletions in the pileup?
|
||||
|
|
@ -306,8 +310,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
|
||||
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( p.getMappingQual() >= UAC.MIN_MAPPING_QUALTY_SCORE &&
|
||||
(UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
|
||||
if ( (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
|
||||
AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES )
|
||||
filteredPileup.add(p);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1PointEM() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("46232790dae2e99e79626de78836ba08"));
|
||||
Arrays.asList("5ad3f97c886a3381821366caa9162c12"));
|
||||
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2PointEM() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("683cfa48cc5e2d140286645873184c20"));
|
||||
Arrays.asList("73c80566c8353958b2ac61932f0b3812"));
|
||||
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testPooled1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
|
||||
Arrays.asList("f2b3799fe18010aa8cfd76b0e0782db7"));
|
||||
Arrays.asList("5a86c53e8f0897a71ff74662c5696dc4"));
|
||||
executeTest("testPooled1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -56,7 +56,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("42fc8d585f2f30dc6c58d413738e1f7c"));
|
||||
Arrays.asList("5e0a92fbddeb9d6e35586d0488a1e5c7"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("c854552bbf4a33b8dca488ea5f0a4a32"));
|
||||
Arrays.asList("68d00450e3c2129ea38c67171722b385"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -72,7 +72,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("9dcb1d4af7c02804150a0f6c38be4e1e"));
|
||||
Arrays.asList("8971ab1c9d2780e5e12e9bfc0b059cd1"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -85,7 +85,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testParallelization() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1,
|
||||
Arrays.asList("35637b1ec52f2a7c0551b87795c3060c"));
|
||||
Arrays.asList("fb5d09eb8f1494d48032be7272699add"));
|
||||
executeTest("test parallelization", spec);
|
||||
}
|
||||
|
||||
|
|
@ -98,11 +98,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testParameter() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "-genotype", "e5d4287d27aa7734f0d57a8213f549ef" );
|
||||
e.put( "-all_bases", "3a291bb06764e3615230f467cc501096" );
|
||||
e.put( "--min_base_quality_score 26", "fb00499f249973c156ca64e30e7b2d91" );
|
||||
e.put( "--min_mapping_quality_score 26", "315c7951a783655445933ed9d83db2c2" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "bca61f8afce9f64763a1225e9c614d38" );
|
||||
e.put( "-genotype", "41af43f6eaab72de553d865a3089bf54" );
|
||||
e.put( "-all_bases", "7cc1609aef6d6cc3dd7822c52c403750" );
|
||||
e.put( "--min_base_quality_score 26", "9596e2102369ced181f2a87d686faf2e" );
|
||||
e.put( "--min_mapping_quality_score 26", "130efb2b8bd7b495bf65c6477bcf83c8" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "18935308954cf390b628c9226eccbe94" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -116,7 +116,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
|
||||
Arrays.asList("c7427818f57cc3bf11e9ee98461c1a65"));
|
||||
Arrays.asList("8c0e1fed37a2eac9eaaaa59e31350f43"));
|
||||
executeTest("testConfidence", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue