Two features useful for ancient DNA processing.

Ancient DNA sequencing data is in many ways different from modern data, and methods to analyze it need to be adapted accordingly. Feature 1: Read adaptor trimming. Ancient DNA libraries typically have very short inserts (in the order of 50 bp), so typical Illumina libraries sequenced in, say, 100bp HiSeq will have a large adaptor component being read after the insert. If this adaptor is not removed, data will not be aligneable. There are third party tools that remove adaptor and potentially merge read pairs, but are cumbersome to use and require precise knowledge of the library construction and adaptor sequence. -- New walker ReadAdaptorTrimmer walks through paired end data, computes pair overlap and trims auto-detected adaptor sequence. -- Unit tests added for trimming operation. -- Utility walker (may be retired later) DetailedReadLengthDistribution computes insert size or read length distribution stratified by read group and mapping status and outputs a GATKReport with data. -- Renamed MaxReadLengthFilter to ReadLengthFilter and added ability to specify minimum read length as a filter (may be useful if, as a consequence of adaptor trimming, we're left with a lot of very short reads which will map poorly and will just clutter output BAMs). Feature 2: Unbiased site QUAL estimation: many times ancestral allele status is not known and VCF fields like QUAL, QD, GQ, etc. are affected by the pop. gen. prior at a site. This might introduce subtle biases in studies where a species is aligned against the reference of another species, so an option for UG and HC not to apply such prior is introduced. -- Added -noPrior argument to StandardCallerArgumentCollection. -- Added option not to fill priors is such argument is set. -- Added an integration test.
2013-02-18 20:00:46 -05:00 · 2013-02-18 20:00:46 -05:00 · 695723ba43
parent 21a6b4add2
commit 695723ba43
6 changed files with 75 additions and 9 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@ -112,6 +112,15 @@ public class StandardCallerArgumentCollection {
    @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
    public int MAX_ALTERNATE_ALLELES = 6;

+    /**
+     * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus.
+     * If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate,
+     * as for example when the ancestral status of the reference allele is not known.
+     */
+    @Advanced
+    @Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false)
+    public boolean ignoreHeterozygosityPrior = false;
+
    /**
     * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
     * Basically, it will ignore the contamination fraction of reads for each alternate allele.  So if the pileup contains N total bases, then we
@ -180,5 +189,6 @@ public class StandardCallerArgumentCollection {
        this.exactCallsLog = SCAC.exactCallsLog;
        this.sampleContamination=SCAC.sampleContamination;
        this.AFmodel = SCAC.AFmodel;
+        this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior;
    }
 }
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
        this.N = samples.size() * ploidy;
        log10AlleleFrequencyPriorsSNPs = new double[N+1];
        log10AlleleFrequencyPriorsIndels = new double[N+1];
-        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
-        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
+        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior);
+        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior);

        filter.add(LOW_QUAL_FILTER_NAME);

@ -722,8 +722,20 @@ public class UnifiedGenotyperEngine {
        return GGAmodel;
    }

-    public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
+    /**
+     * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used,
+     * where Pr(AC=i) = theta/i where theta is heterozygosity
+     * @param N                                Number of chromosomes
+     * @param priors                           (output) array to be filled with priors
+     * @param theta                            Heterozygosity
+     * @param ignorePriors                     If true, priors are ignored and zeros returned
+     */
+    public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) {

+        if (ignorePriors) {
+            Arrays.fill(priors, 0,N,0.0);
+            return;
+        }
        double sum = 0.0;

        // for each i
@ -733,6 +745,10 @@ public class UnifiedGenotyperEngine {
            sum += value;
        }

+        // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
+        if (sum > 1.0) {
+            throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument");
+        }
        // null frequency for AF=0 is (1 - sum(all other frequencies))
        priors[0] = Math.log10(1.0 - sum);
    }
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java
@ -111,7 +111,7 @@ public class AFCalcTestBuilder {
                return MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
            case human:
                final double[] humanPriors = new double[nPriorValues];
-                UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
+                UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
                return humanPriors;
            default:
                throw new RuntimeException("Unexpected type " + priorType);
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@ -139,6 +139,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
        executeTest("test confidence 1", spec1);
    }

+    @Test
+    public void testNoPrior() {
+        WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
+                baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
+                Arrays.asList("422656266117f8d01e17e5c491c49a24"));
+        executeTest("test no prior 1", spec1);
+
+    }
+
    // --------------------------------------------------------------------------------------------------------------
    //
    // testing heterozygosity
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
            final int nPriorValues = 2*nSamples+1;
            final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
            final double[] humanPriors = new double[nPriorValues];
-            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
+            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);

            for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
                for ( AFCalc model : calcs ) {
@ -575,6 +575,35 @@ public class AFCalcUnitTest extends BaseTest {
        return tests.toArray(new Object[][]{});
    }

+
+    @Test(enabled = true, dataProvider =  "Models")
+    public void testNoPrior(final AFCalc model) {
+        for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
+            final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
+
+            final double[] flatPriors = new double[]{0.0,0.0,0.0};
+            final double[] noPriors = new double[3];
+            // test that function computeAlleleFrequency correctly operates when the -noPrior option is set
+            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true);
+
+            GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
+            GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
+            final AFCalcResult resultTrackerFlat = cfgFlatPrior.execute();
+            final AFCalcResult resultTrackerNoPrior = cfgNoPrior.execute();
+
+            final double pRefWithNoPrior = AB.getLikelihoods().getAsVector()[0];
+            final double pHetWithNoPrior = AB.getLikelihoods().getAsVector()[1]  - Math.log10(0.5);
+            final double nonRefPost = Math.pow(10, pHetWithNoPrior) / (Math.pow(10, pRefWithNoPrior) + Math.pow(10, pHetWithNoPrior));
+            final double log10NonRefPost = Math.log10(nonRefPost);
+
+            if ( ! Double.isInfinite(log10NonRefPost) ) {
+                // check that the no-prior and flat-prior constructions yield same result
+                Assert.assertEquals(resultTrackerFlat.getLog10PosteriorOfAFGT0(), resultTrackerNoPrior.getLog10PosteriorOfAFGT0());
+            }
+
+        }
+    }
+
    @Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models")
    public void testBiallelicPriors(final AFCalc model) {

--- a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java
@ -29,18 +29,20 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.commandline.Argument;

 /**
- * Filters out reads whose length is >= some value.
+ * Filters out reads whose length is >= some value or < some value.
 *
 * @author mhanna
 * @version 0.1
 */
-public class MaxReadLengthFilter extends ReadFilter {
+public class ReadLengthFilter extends ReadFilter {
    @Argument(fullName = "maxReadLength", shortName = "maxRead", doc="Discard reads with length greater than the specified value", required=true)
    private int maxReadLength;
-    
+
+    @Argument(fullName = "minReadLength", shortName = "minRead", doc="Discard reads with length shorter than the specified value", required=true)
+    private int minReadLength = 1;
    public boolean filterOut(SAMRecord read) {
        // check the length
-        return read.getReadLength() > maxReadLength;
+        return read.getReadLength() > maxReadLength || read.getReadLength() < minReadLength;
    }

 }