- Removed the max_coverage argument from UG; Aaron will set it up so that we don't call at loci where the GATK had to drop reads.
- Reimplemented the optimization in UG to not call when there are no non-ref bases.
- Compute reference confidence accurately in UG for ref calls.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2693 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2c8d7b0c44
commit
47440bc029
|
|
@ -48,17 +48,17 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
|
|
||||||
// if there are no non-ref bases...
|
// if there are no non-ref bases...
|
||||||
if ( bestAlternateAllele == null ) {
|
if ( bestAlternateAllele == null ) {
|
||||||
// if we don't want all bases, then we can just return
|
// if we don't want all bases, then we don't need to calculate genotype likelihoods
|
||||||
|
if ( !ALL_BASE_MODE && !GENOTYPE_MODE ) {
|
||||||
// todo -- we still need to calculate the confidence in the reference base.
|
VariantCallContext vcc = new VariantCallContext(false);
|
||||||
// todo -- we can still include this optimization, but we should calculate a confidence score
|
estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false);
|
||||||
// if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
|
return vcc;
|
||||||
// return new VariantCallContext(false);
|
}
|
||||||
|
|
||||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||||
bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
|
bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// calculate likelihoods if there are non-ref bases
|
||||||
initializeAlleleFrequencies(frequencyEstimationPoints);
|
initializeAlleleFrequencies(frequencyEstimationPoints);
|
||||||
|
|
||||||
initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||||
|
|
@ -69,8 +69,14 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
if ( verboseWriter != null )
|
if ( verboseWriter != null )
|
||||||
printAlleleFrequencyData(ref, loc, frequencyEstimationPoints);
|
printAlleleFrequencyData(ref, loc, frequencyEstimationPoints);
|
||||||
|
|
||||||
return createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);
|
VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);
|
||||||
}
|
|
||||||
|
// technically, at this point our confidence in a reference call isn't accurately
|
||||||
|
// estimated because it didn't take into account samples with no data
|
||||||
|
if ( vcc.variation == null )
|
||||||
|
estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, true);
|
||||||
|
return vcc;
|
||||||
|
}
|
||||||
|
|
||||||
protected int getNSamples(Map<String, StratifiedAlignmentContext> contexts) {
|
protected int getNSamples(Map<String, StratifiedAlignmentContext> contexts) {
|
||||||
return contexts.size();
|
return contexts.size();
|
||||||
|
|
@ -151,6 +157,27 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
|
||||||
return AFs;
|
return AFs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a reference call is confident and stores the answer in {@code vcc.confidentlyCalled}.
 *
 * P_of_ref is built up as a product of per-sample factors; the call is marked confident when the
 * phred-scaled value of (1 - P_of_ref) is at least CONFIDENCE_THRESHOLD.
 *
 * @param vcc                  call context whose confidentlyCalled flag is overwritten
 * @param contexts             per-sample alignment contexts, looked up by the entries of {@code samples};
 *                             a sample that is missing from the map is treated as having depth 0
 * @param theta                prior weight applied to each sample's factor; the callers visible in this
 *                             change pass DiploidGenotypePriors.HUMAN_HETEROZYGOSITY
 * @param ignoreCoveredSamples when true, start from 1 - PofFs[index of bestAlternateAllele] and skip every
 *                             sample present in contexts; when false, start from 1.0 and include every sample
 */
private void estimateReferenceConfidence(VariantCallContext vcc, Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {
|
||||||
|

||||||
|
double P_of_ref = 1.0;
|
||||||
|

||||||
|
// use the AF=0 prob if it's calculated (presumably PofFs holds the per-alternate-base probability that the allele frequency is > 0 -- confirm)
|
||||||
|
if ( ignoreCoveredSamples )
|
||||||
|
P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];
|
||||||
|

||||||
|
// fold in each sample that has not been examined yet (covered samples are skipped only when ignoreCoveredSamples is set)
|
||||||
|
for ( String sample : samples ) {
|
||||||
|
boolean isCovered = contexts.containsKey(sample);
|
||||||
|
if ( ignoreCoveredSamples && isCovered )
|
||||||
|
continue;
|
||||||
|

||||||
|
int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
|
||||||
|
// NOTE(review): binomialProbability(0, depth, 0.5) looks like the chance of seeing zero alternate bases in `depth` reads at a het site, weighted by theta/2 -- confirm against MathUtils
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
|
||||||
|
}
|
||||||
|

||||||
|
vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= CONFIDENCE_THRESHOLD;
|
||||||
|
}
|
||||||
|
|
||||||
protected void calculateAlleleFrequencyPosteriors(char ref, int frequencyEstimationPoints, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
|
protected void calculateAlleleFrequencyPosteriors(char ref, int frequencyEstimationPoints, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
|
||||||
|
|
||||||
// initialization
|
// initialization
|
||||||
|
|
|
||||||
|
|
@ -83,7 +83,4 @@ public class UnifiedArgumentCollection {
|
||||||
|
|
||||||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||||
public Double MAX_DELETION_FRACTION = 0.05;
|
public Double MAX_DELETION_FRACTION = 0.05;
|
||||||
|
|
||||||
@Argument(fullName = "max_coverage", shortName = "mc", doc = "Maximum reads at this locus (after filtering bad bases/reads) for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
|
|
||||||
public Integer MAX_READS_IN_PILEUP = 100000;
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -260,15 +260,19 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
||||||
if ( !BaseUtils.isRegularBase(ref) )
|
if ( !BaseUtils.isRegularBase(ref) )
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
|
ReadBackedPileup rawPileup = rawContext.getBasePileup();
|
||||||
|
// don't try to call if we couldn't read in all reads at this locus (since it wasn't properly downsampled)
|
||||||
|
if ( rawPileup.size() == getToolkit().getArguments().readMaxPileup )
|
||||||
|
return null;
|
||||||
|
|
||||||
// filter the context based on min base and mapping qualities
|
// filter the context based on min base and mapping qualities
|
||||||
ReadBackedPileup pileup = rawContext.getBasePileup().getBaseFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE);
|
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||||
|
|
||||||
// filter the context based on mapping quality and mismatch rate
|
// filter the context based on mapping quality and mismatch rate
|
||||||
pileup = filterPileup(pileup, refContext, UAC);
|
pileup = filterPileup(pileup, refContext, UAC);
|
||||||
|
|
||||||
// an optimization to speed things up when there is no coverage or when overly covered
|
// don't call when there is no coverage
|
||||||
if ( pileup.size() == 0 ||
|
if ( pileup.size() == 0 )
|
||||||
(UAC.MAX_READS_IN_PILEUP > 0 && pileup.size() > UAC.MAX_READS_IN_PILEUP) )
|
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
// are there too many deletions in the pileup?
|
// are there too many deletions in the pileup?
|
||||||
|
|
@ -306,8 +310,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
||||||
|
|
||||||
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
||||||
for ( PileupElement p : pileup ) {
|
for ( PileupElement p : pileup ) {
|
||||||
if ( p.getMappingQual() >= UAC.MIN_MAPPING_QUALTY_SCORE &&
|
if ( (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
|
||||||
(UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
|
|
||||||
AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES )
|
AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES )
|
||||||
filteredPileup.add(p);
|
filteredPileup.add(p);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultiSamplePilot1PointEM() {
|
public void testMultiSamplePilot1PointEM() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||||
Arrays.asList("46232790dae2e99e79626de78836ba08"));
|
Arrays.asList("5ad3f97c886a3381821366caa9162c12"));
|
||||||
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
|
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultiSamplePilot2PointEM() {
|
public void testMultiSamplePilot2PointEM() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||||
Arrays.asList("683cfa48cc5e2d140286645873184c20"));
|
Arrays.asList("73c80566c8353958b2ac61932f0b3812"));
|
||||||
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
|
executeTest("testMultiSamplePilot2 - Point Estimate EM", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testPooled1() {
|
public void testPooled1() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1,
|
||||||
Arrays.asList("f2b3799fe18010aa8cfd76b0e0782db7"));
|
Arrays.asList("5a86c53e8f0897a71ff74662c5696dc4"));
|
||||||
executeTest("testPooled1", spec);
|
executeTest("testPooled1", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -56,7 +56,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultiSamplePilot1Joint() {
|
public void testMultiSamplePilot1Joint() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||||
Arrays.asList("42fc8d585f2f30dc6c58d413738e1f7c"));
|
Arrays.asList("5e0a92fbddeb9d6e35586d0488a1e5c7"));
|
||||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultiSamplePilot2Joint() {
|
public void testMultiSamplePilot2Joint() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||||
Arrays.asList("c854552bbf4a33b8dca488ea5f0a4a32"));
|
Arrays.asList("68d00450e3c2129ea38c67171722b385"));
|
||||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -72,7 +72,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testSingleSamplePilot2Joint() {
|
public void testSingleSamplePilot2Joint() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||||
Arrays.asList("9dcb1d4af7c02804150a0f6c38be4e1e"));
|
Arrays.asList("8971ab1c9d2780e5e12e9bfc0b059cd1"));
|
||||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -85,7 +85,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testParallelization() {
|
public void testParallelization() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1,
|
||||||
Arrays.asList("35637b1ec52f2a7c0551b87795c3060c"));
|
Arrays.asList("fb5d09eb8f1494d48032be7272699add"));
|
||||||
executeTest("test parallelization", spec);
|
executeTest("test parallelization", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -98,11 +98,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testParameter() {
|
public void testParameter() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "-genotype", "e5d4287d27aa7734f0d57a8213f549ef" );
|
e.put( "-genotype", "41af43f6eaab72de553d865a3089bf54" );
|
||||||
e.put( "-all_bases", "3a291bb06764e3615230f467cc501096" );
|
e.put( "-all_bases", "7cc1609aef6d6cc3dd7822c52c403750" );
|
||||||
e.put( "--min_base_quality_score 26", "fb00499f249973c156ca64e30e7b2d91" );
|
e.put( "--min_base_quality_score 26", "9596e2102369ced181f2a87d686faf2e" );
|
||||||
e.put( "--min_mapping_quality_score 26", "315c7951a783655445933ed9d83db2c2" );
|
e.put( "--min_mapping_quality_score 26", "130efb2b8bd7b495bf65c6477bcf83c8" );
|
||||||
e.put( "--max_mismatches_in_40bp_window 5", "bca61f8afce9f64763a1225e9c614d38" );
|
e.put( "--max_mismatches_in_40bp_window 5", "18935308954cf390b628c9226eccbe94" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
|
@ -116,7 +116,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testConfidence() {
|
public void testConfidence() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
|
"-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1,
|
||||||
Arrays.asList("c7427818f57cc3bf11e9ee98461c1a65"));
|
Arrays.asList("8c0e1fed37a2eac9eaaaa59e31350f43"));
|
||||||
executeTest("testConfidence", spec);
|
executeTest("testConfidence", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue