Merge pull request #93 from broadinstitute/gda_ancient_dna

Two features useful for ancient DNA processing. Ancient DNA sequencing d...
2013-03-10 17:57:28 -07:00 · 2013-03-10 17:57:28 -07:00 · 508b58376c
parent 21a6b4add2 695723ba43
commit 508b58376c
6 changed files with 75 additions and 9 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@ -112,6 +112,15 @@ public class StandardCallerArgumentCollection {
    @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
    public int MAX_ALTERNATE_ALLELES = 6;

+    /**
+     * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus.
+     * If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate,
+     * as for example when the ancestral status of the reference allele is not known.
+     */
+    @Advanced
+    @Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false)
+    public boolean ignoreHeterozygosityPrior = false;
+
    /**
     * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
     * Basically, it will ignore the contamination fraction of reads for each alternate allele.  So if the pileup contains N total bases, then we
@ -180,5 +189,6 @@ public class StandardCallerArgumentCollection {
        this.exactCallsLog = SCAC.exactCallsLog;
        this.sampleContamination=SCAC.sampleContamination;
        this.AFmodel = SCAC.AFmodel;
+        this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior;
    }
 }
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
        this.N = samples.size() * ploidy;
        log10AlleleFrequencyPriorsSNPs = new double[N+1];
        log10AlleleFrequencyPriorsIndels = new double[N+1];
-        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
-        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
+        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior);
+        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior);

        filter.add(LOW_QUAL_FILTER_NAME);

@ -722,8 +722,20 @@ public class UnifiedGenotyperEngine {
        return GGAmodel;
    }

-    public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
+    /**
+     * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used,
+     * where Pr(AC=i) = theta/i where theta is heterozygosity
+     * @param N                                Number of chromosomes
+     * @param priors                           (output) array to be filled with priors
+     * @param theta                            Heterozygosity
+     * @param ignorePriors                     If true, priors are ignored and zeros returned
+     */
+    public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) {

+        if (ignorePriors) {
+            Arrays.fill(priors, 0,N,0.0);
+            return;
+        }
        double sum = 0.0;

        // for each i
@ -733,6 +745,10 @@ public class UnifiedGenotyperEngine {
            sum += value;
        }

+        // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
+        if (sum > 1.0) {
+            throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument");
+        }
        // null frequency for AF=0 is (1 - sum(all other frequencies))
        priors[0] = Math.log10(1.0 - sum);
    }
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java
@ -111,7 +111,7 @@ public class AFCalcTestBuilder {
                return MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
            case human:
                final double[] humanPriors = new double[nPriorValues];
-                UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
+                UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
                return humanPriors;
            default:
                throw new RuntimeException("Unexpected type " + priorType);
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@ -139,6 +139,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
        executeTest("test confidence 1", spec1);
    }

+    @Test
+    public void testNoPrior() {
+        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
+                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
+                Arrays.asList("422656266117f8d01e17e5c491c49a24"));
+        executeTest("test no prior 1", spec1);
+
+    }
+
    // --------------------------------------------------------------------------------------------------------------
    //
    // testing heterozygosity
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
            final int nPriorValues = 2*nSamples+1;
            final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
            final double[] humanPriors = new double[nPriorValues];
-            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
+            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);

            for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
                for ( AFCalc model : calcs ) {
@ -575,6 +575,35 @@ public class AFCalcUnitTest extends BaseTest {
        return tests.toArray(new Object[][]{});
    }

+
+    @Test(enabled = true, dataProvider =  "Models")
+    public void testNoPrior(final AFCalc model) {
+        for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
+            final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
+
+            final double[] flatPriors = new double[]{0.0,0.0,0.0};
+            final double[] noPriors = new double[3];
+            // test that function computeAlleleFrequency correctly operates when the -noPrior option is set
+            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true);
+
+            GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
+            GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
+            final AFCalcResult resultTrackerFlat = cfgFlatPrior.execute();
+            final AFCalcResult resultTrackerNoPrior = cfgNoPrior.execute();
+
+            final double pRefWithNoPrior = AB.getLikelihoods().getAsVector()[0];
+            final double pHetWithNoPrior = AB.getLikelihoods().getAsVector()[1]  - Math.log10(0.5);
+            final double nonRefPost = Math.pow(10, pHetWithNoPrior) / (Math.pow(10, pRefWithNoPrior) + Math.pow(10, pHetWithNoPrior));
+            final double log10NonRefPost = Math.log10(nonRefPost);
+
+            if ( ! Double.isInfinite(log10NonRefPost) ) {
+                // check that the no-prior and flat-prior constructions yield same result
+                Assert.assertEquals(resultTrackerFlat.getLog10PosteriorOfAFGT0(), resultTrackerNoPrior.getLog10PosteriorOfAFGT0());
+            }
+
+        }
+    }
+
    @Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models")
    public void testBiallelicPriors(final AFCalc model) {

--- a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java
@ -29,18 +29,20 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.commandline.Argument;

 /**
- * Filters out reads whose length is >= some value.
+ * Filters out reads whose length is >= some value or < some value.
 *
 * @author mhanna
 * @version 0.1
 */
-public class MaxReadLengthFilter extends ReadFilter {
+public class ReadLengthFilter extends ReadFilter {
    @Argument(fullName = "maxReadLength", shortName = "maxRead", doc="Discard reads with length greater than the specified value", required=true)
    private int maxReadLength;
-    
+
+    @Argument(fullName = "minReadLength", shortName = "minRead", doc="Discard reads with length shorter than the specified value", required=true)
+    private int minReadLength = 1;
    public boolean filterOut(SAMRecord read) {
        // check the length
-        return read.getReadLength() > maxReadLength;
+        return read.getReadLength() > maxReadLength || read.getReadLength() < minReadLength;
    }

 }