Merge pull request #93 from broadinstitute/gda_ancient_dna

Two features useful for ancient DNA processing. Ancient DNA sequencing d...
This commit is contained in:
Eric Banks 2013-03-10 17:57:28 -07:00
commit 508b58376c
6 changed files with 75 additions and 9 deletions

View File

@ -112,6 +112,15 @@ public class StandardCallerArgumentCollection {
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 6;
/**
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus.
* If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate,
* as for example when the ancestral status of the reference allele is not known.
*/
@Advanced
@Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false)
public boolean ignoreHeterozygosityPrior = false;
/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
@ -180,5 +189,6 @@ public class StandardCallerArgumentCollection {
this.exactCallsLog = SCAC.exactCallsLog;
this.sampleContamination=SCAC.sampleContamination;
this.AFmodel = SCAC.AFmodel;
this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior;
}
}

View File

@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
this.N = samples.size() * ploidy;
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior);
filter.add(LOW_QUAL_FILTER_NAME);
@ -722,8 +722,20 @@ public class UnifiedGenotyperEngine {
return GGAmodel;
}
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
/**
* Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used,
* where Pr(AC=i) = theta/i where theta is heterozygosity
* @param N Number of chromosomes
* @param priors (output) array to be filled with priors
* @param theta Heterozygosity
* @param ignorePriors If true, priors are ignored and zeros returned
*/
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) {
if (ignorePriors) {
Arrays.fill(priors, 0,N,0.0);
return;
}
double sum = 0.0;
// for each i
@ -733,6 +745,10 @@ public class UnifiedGenotyperEngine {
sum += value;
}
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
if (sum > 1.0) {
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument");
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
}

View File

@ -111,7 +111,7 @@ public class AFCalcTestBuilder {
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
case human:
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
return humanPriors;
default:
throw new RuntimeException("Unexpected type " + priorType);

View File

@ -139,6 +139,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("test confidence 1", spec1);
}
@Test
public void testNoPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
Arrays.asList("422656266117f8d01e17e5c491c49a24"));
executeTest("test no prior 1", spec1);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing heterozygosity

View File

@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
final int nPriorValues = 2*nSamples+1;
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
for ( AFCalc model : calcs ) {
@ -575,6 +575,35 @@ public class AFCalcUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "Models")
public void testNoPrior(final AFCalc model) {
for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
final double[] flatPriors = new double[]{0.0,0.0,0.0};
final double[] noPriors = new double[3];
// test that function computeAlleleFrequency correctly operates when the -noPrior option is set
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true);
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
final AFCalcResult resultTrackerFlat = cfgFlatPrior.execute();
final AFCalcResult resultTrackerNoPrior = cfgNoPrior.execute();
final double pRefWithNoPrior = AB.getLikelihoods().getAsVector()[0];
final double pHetWithNoPrior = AB.getLikelihoods().getAsVector()[1] - Math.log10(0.5);
final double nonRefPost = Math.pow(10, pHetWithNoPrior) / (Math.pow(10, pRefWithNoPrior) + Math.pow(10, pHetWithNoPrior));
final double log10NonRefPost = Math.log10(nonRefPost);
if ( ! Double.isInfinite(log10NonRefPost) ) {
// check that the no-prior and flat-prior constructions yield same result
Assert.assertEquals(resultTrackerFlat.getLog10PosteriorOfAFGT0(), resultTrackerNoPrior.getLog10PosteriorOfAFGT0());
}
}
}
@Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models")
public void testBiallelicPriors(final AFCalc model) {

View File

@ -29,18 +29,20 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Argument;
/**
* Filters out reads whose length is >= some value.
* Filters out reads whose length is >= some value or < some value.
*
* @author mhanna
* @version 0.1
*/
public class MaxReadLengthFilter extends ReadFilter {
public class ReadLengthFilter extends ReadFilter {
@Argument(fullName = "maxReadLength", shortName = "maxRead", doc="Discard reads with length greater than the specified value", required=true)
private int maxReadLength;
@Argument(fullName = "minReadLength", shortName = "minRead", doc="Discard reads with length shorter than the specified value", required=true)
private int minReadLength = 1;
public boolean filterOut(SAMRecord read) {
// check the length
return read.getReadLength() > maxReadLength;
return read.getReadLength() > maxReadLength || read.getReadLength() < minReadLength;
}
}