Improvements and bug fixes galore. (1) Now properly handles Q0 bases, filtering them out, you can disable this if you need to (2) support for three-state base probabilities (see email), which is disabled by default (still experimental) but appears to be more emppowered to detect variants (see email too)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1327 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-07-28 13:21:38 +00:00
parent 46643d3724
commit df4fd498c5
1 changed files with 25 additions and 7 deletions

View File

@ -38,6 +38,9 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confidient genotypes or just the variants?", required = false)
public boolean GENOTYPE = false;
@Argument(fullName = "3BaseErrors", shortName = "3BaseErrors", doc = "Should we use a 3-base error mode (so that P(b_true != b_called | e) == e / 3?", required = false)
public boolean THREE_BASE_ERRORS = false;
// Control periodic reporting features
@Argument(fullName = "metrics_interval", shortName = "metint", doc = "Number of loci to process between metrics reports", required = false) public Integer METRICS_INTERVAL = 50000;
@Argument(fullName = "suppress_metrics", shortName = "printmets", doc = "If specified, don't display metrics", required = false) public Boolean SUPPRESS_METRICS = false;
@ -47,7 +50,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
@Argument(fullName = "call_indels", shortName = "indels", doc = "Call indels as well as point mutations", required = false) public Boolean CALL_INDELS = false;
// Control how the genotype hypotheses are weighed
@Argument(fullName = "priors_any_locus", shortName = "plocus", doc = "Comma-separated prior likelihoods for any locus (homref,het,homvar)", required = false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5";
@Argument(fullName = "priors_locus", shortName = "plocus", doc = "Comma-separated prior likelihoods for any locus (homref,het,homvar)", required = false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5";
@Argument(fullName = "priors_hapmap", shortName = "phapmap", doc = "Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required = false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5";
@Argument(fullName = "priors_dbsnp", shortName = "pdbsnp", doc = "Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required = false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5";
@Argument(fullName = "priors_2nd_on", shortName = "p2ndon", doc = "Comma-separated prior likelihoods for the secondary bases of primary on-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required = false) public String PRIORS_2ND_ON = "0.000,0.302,0.366,0.142,0.000,0.548,0.370,0.000,0.319,0.000";
@ -56,6 +59,9 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
// Control various sample-level settings
@Argument(fullName = "sample_name_regex", shortName = "sample_name_regex", doc = "Replaces the sample name specified in the BAM read group with the value supplied here", required = false) public String SAMPLE_NAME_REGEX = null;
@Argument(fullName = "keepQ0Bases", shortName = "keepQ0Bases", doc = "If true, then Q0 bases will be included in the genotyping calculation, and treated as Q1 -- this is really not a good idea", required = false)
public boolean keepQ0Bases = false;
public AlleleMetrics metricsOut;
public PrintStream variantsOut;
public String sampleName;
@ -67,6 +73,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
public double[] p2ndon;
public double[] p2ndoff;
public long nFilteredQ0Bases = 0;
/**
* Specify that this walker requires reads.
*
@ -225,21 +233,30 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
* @return the likelihoods per genotype
*/
private GenotypeLikelihoods callGenotype(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup, List<SAMRecord> reads, List<Integer> offsets) {
GenotypeLikelihoods G;
GenotypeLikelihoods G = null;
if (isHapmapSite(tracker)) {
G = new GenotypeLikelihoods(phapmap[0], phapmap[1], phapmap[2], p2ndon, p2ndoff);
G = new GenotypeLikelihoods(THREE_BASE_ERRORS, phapmap[0], phapmap[1], phapmap[2], p2ndon, p2ndoff);
} else if (isDbSNPSite(tracker)) {
G = new GenotypeLikelihoods(pdbsnp[0], pdbsnp[1], pdbsnp[2], p2ndon, p2ndoff);
G = new GenotypeLikelihoods(THREE_BASE_ERRORS, pdbsnp[0], pdbsnp[1], pdbsnp[2], p2ndon, p2ndoff);
} else {
G = new GenotypeLikelihoods(plocus[0], plocus[1], plocus[2], p2ndon, p2ndoff);
G = new GenotypeLikelihoods(THREE_BASE_ERRORS, plocus[0], plocus[1], plocus[2], p2ndon, p2ndoff);
}
G.filterQ0Bases(! keepQ0Bases); // Set the filtering / keeping flag
for (int i = 0; i < reads.size(); i++) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
char base = read.getReadString().charAt(offset);
byte qual = read.getBaseQualities()[offset];
int nBasesAdded = G.add(ref, base, qual);
if ( nBasesAdded == 0 ) {
nFilteredQ0Bases++;
//System.out.printf("Filtering Q0 base from %s at %s %d: %c %c %d%n",
// read.getReadName(), GenomeLocParser.createGenomeLoc(read), offset, ref, base, qual);
}
}
G.ApplyPrior(ref, this.alt_allele, this.allele_frequency_prior);
@ -352,7 +369,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
}
/** Close the variant file. */
public void onTraversalDone(String sum) {
public void onTraversalDone(String sum) {
logger.info(String.format("SingleSampleGenotyper filtered %d Q0 bases", nFilteredQ0Bases));
if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GLF) {
mGenotypeWriter.close();
} else {