Improvements and bug fixes galore. (1) Now properly handles Q0 bases, filtering them out, you can disable this if you need to (2) support for three-state base probabilities (see email), which is disabled by default (still experimental) but appears to be more emppowered to detect variants (see email too)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1327 348d0f76-0448-11de-a6fe-93d51630548a
2009-07-28 13:21:38 +00:00 · 2009-07-28 13:21:38 +00:00 · df4fd498c5
parent 46643d3724
commit df4fd498c5
1 changed files with 25 additions and 7 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java
@ -38,6 +38,9 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
    @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confidient genotypes or just the variants?", required = false)
    public boolean GENOTYPE = false;

+    @Argument(fullName = "3BaseErrors", shortName = "3BaseErrors", doc = "Should we use a 3-base error mode (so that P(b_true != b_called | e) == e / 3?", required = false)
+    public boolean THREE_BASE_ERRORS = false;
+
    // Control periodic reporting features
    @Argument(fullName = "metrics_interval", shortName = "metint", doc = "Number of loci to process between metrics reports", required = false) public Integer METRICS_INTERVAL = 50000;
    @Argument(fullName = "suppress_metrics", shortName = "printmets", doc = "If specified, don't display metrics", required = false) public Boolean SUPPRESS_METRICS = false;
@ -47,7 +50,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
    @Argument(fullName = "call_indels", shortName = "indels", doc = "Call indels as well as point mutations", required = false) public Boolean CALL_INDELS = false;

    // Control how the genotype hypotheses are weighed
-    @Argument(fullName = "priors_any_locus", shortName = "plocus", doc = "Comma-separated prior likelihoods for any locus (homref,het,homvar)", required = false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5";
+    @Argument(fullName = "priors_locus", shortName = "plocus", doc = "Comma-separated prior likelihoods for any locus (homref,het,homvar)", required = false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5";
    @Argument(fullName = "priors_hapmap", shortName = "phapmap", doc = "Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required = false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5";
    @Argument(fullName = "priors_dbsnp", shortName = "pdbsnp", doc = "Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required = false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5";
    @Argument(fullName = "priors_2nd_on", shortName = "p2ndon", doc = "Comma-separated prior likelihoods for the secondary bases of primary on-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required = false) public String PRIORS_2ND_ON = "0.000,0.302,0.366,0.142,0.000,0.548,0.370,0.000,0.319,0.000";
@ -56,6 +59,9 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
    // Control various sample-level settings
    @Argument(fullName = "sample_name_regex", shortName = "sample_name_regex", doc = "Replaces the sample name specified in the BAM read group with the value supplied here", required = false) public String SAMPLE_NAME_REGEX = null;

+    @Argument(fullName = "keepQ0Bases", shortName = "keepQ0Bases", doc = "If true, then Q0 bases will be included in the genotyping calculation, and treated as Q1 -- this is really not a good idea", required = false)
+    public boolean keepQ0Bases = false;
+
    public AlleleMetrics metricsOut;
    public PrintStream variantsOut;
    public String sampleName;
@ -67,6 +73,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
    public double[] p2ndon;
    public double[] p2ndoff;

+    public long nFilteredQ0Bases = 0;
+
    /**
     * Specify that this walker requires reads.
     *
@ -225,21 +233,30 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
     * @return the likelihoods per genotype
     */
    private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
-        GenotypeLikelihoods G;
+        GenotypeLikelihoods G = null;

        if (isHapmapSite(tracker)) {
-            G = new GenotypeLikelihoods(phapmap[0], phapmap[1], phapmap[2], p2ndon, p2ndoff);
+            G = new GenotypeLikelihoods(THREE_BASE_ERRORS, phapmap[0], phapmap[1], phapmap[2], p2ndon, p2ndoff);
        } else if (isDbSNPSite(tracker)) {
-            G = new GenotypeLikelihoods(pdbsnp[0], pdbsnp[1], pdbsnp[2], p2ndon, p2ndoff);
+            G = new GenotypeLikelihoods(THREE_BASE_ERRORS, pdbsnp[0], pdbsnp[1], pdbsnp[2], p2ndon, p2ndoff);
        } else {
-            G = new GenotypeLikelihoods(plocus[0], plocus[1], plocus[2], p2ndon, p2ndoff);
+            G = new GenotypeLikelihoods(THREE_BASE_ERRORS, plocus[0], plocus[1], plocus[2], p2ndon, p2ndoff);
        }

+        G.filterQ0Bases(! keepQ0Bases); // Set the filtering / keeping flag
+        
        for (int i = 0; i < reads.size(); i++) {
            SAMRecord read = reads.get(i);
            int offset = offsets.get(i);

-            G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
+            char base = read.getReadString().charAt(offset);
+            byte qual = read.getBaseQualities()[offset];
+            int nBasesAdded = G.add(ref, base, qual);
+            if ( nBasesAdded == 0 ) {
+                nFilteredQ0Bases++;
+                //System.out.printf("Filtering Q0 base from %s at %s %d: %c %c %d%n",
+                //        read.getReadName(), GenomeLocParser.createGenomeLoc(read), offset, ref, base, qual);
+            }
        }

        G.ApplyPrior(ref, this.alt_allele, this.allele_frequency_prior);
@ -352,7 +369,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
    }

    /** Close the variant file. */
-    public void onTraversalDone(String sum) { 
+    public void onTraversalDone(String sum) {
+        logger.info(String.format("SingleSampleGenotyper filtered %d Q0 bases", nFilteredQ0Bases));
        if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GLF) {
            mGenotypeWriter.close();
        } else {