Still being tested trio-aware genotyper that calculates P(de novo)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3041 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-03-19 13:11:39 +00:00
parent 08d9ae403d
commit b221ce94ce
1 changed files with 172 additions and 0 deletions

View File

@ -0,0 +1,172 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableVariantContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariationRod;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
import java.util.*;
import java.io.File;
/**
* Implements an (STILL BEING TESTED) algorithm for calling SNPs in trios
*/
@By(DataSource.REFERENCE)
//@Requires(value={DataSource.REFERENCE, DataSource.REFERENCE_BASES, DataSource.READS},referenceMetaData={@RMD(name="sites",type= VariationRod.class)})
@Allows({DataSource.READS, DataSource.REFERENCE})
//, @RMD(name="parent1",type= VariationRod.class), @RMD(name="parent2",type= VariationRod.class)})
public class TrioGenotyperWalker extends RefWalker<VariantContext, Integer>{
@Argument(shortName="mom", doc="", required=true)
protected String mom;
@Argument(shortName="dad", doc="", required=true)
protected String dad;
@Argument(shortName="kid", doc="", required=true)
protected String kid;
@Argument(shortName="log10PriorOfDeNovoOfTrueVariant", doc="", required=false)
double LOG10_MENDEL_VIOLATION_PRIOR = -5; // 30 in 3B bases
@Argument(shortName = "varout", doc = "File to which variants should be written", required = true)
public String vcfOutputFile = null;
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
UnifiedGenotyperEngine UGEngine = null;
private List<String> FAMILY_MEMBERS;
private VCFWriter writer = null;
public void initialize() {
UGEngine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, null, null);
// initialize the header
FAMILY_MEMBERS = Arrays.asList(mom, dad, kid);
// initialize the writer
writer = new VCFWriter(new File(vcfOutputFile));
}
public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
VariantContext vc = tracker.getVariantContext("variants", EnumSet.of(VariantContext.Type.SNP), context.getLocation(), true);
if ( vc != null && vc.isPolymorphic() ) {
if ( ! vc.hasGenotypes(FAMILY_MEMBERS) )
throw new StingException("variants file does not contain genotypes for everyone in family: " + FAMILY_MEMBERS);
VariantCallContext call = UGEngine.runGenotyper(tracker, ref, context);
// is call ever be null?
vc = annotateTrioPrior(vc, call.vc);
return vc;
} else {
return null;
}
}
private VariantContext annotateTrioPrior(VariantContext vcIn, VariantContext call) {
Genotype momG = call.getGenotype(mom);
Genotype dadG = call.getGenotype(dad);
Genotype kidG = call.getGenotype(kid);
double log10POfGenotype = Double.MIN_VALUE;
if ( MendelianViolationEvaluator.isViolation(call, momG, dadG, kidG) ) {
Allele R = call.getReference();
Allele A = call.getAlternateAllele(0);
List<List<Allele>> possibleGenotypes = Arrays.asList(Arrays.asList(R,R), Arrays.asList(R,A), Arrays.asList(A,A));
double[] L = new double[3 * 3 * 3];
int i = 0, bestIndex = 0;
double log10LOfBestGenotypes = Integer.MIN_VALUE;
for ( List<Allele> momPG : possibleGenotypes ) {
for ( List<Allele> dadPG : possibleGenotypes ) {
for ( List<Allele> kidPG : possibleGenotypes ) {
double log10LOfG = genotypeL(momPG, momG) + genotypeL(dadPG, dadG) + genotypeL(kidPG, kidG);
boolean isViolation = MendelianViolationEvaluator.isViolation(call, momPG, dadPG, kidPG);
double log10prior = isViolation ? LOG10_MENDEL_VIOLATION_PRIOR : 0;
L[i] = log10LOfG + log10prior;
if ( log10LOfG > log10LOfBestGenotypes ) {
bestIndex = i;
log10LOfBestGenotypes = log10LOfG;
}
logger.debug(String.format("%10s %10s => %10s : %b\t%.2f\t\t%.2f\t\t%3.2f", momPG, dadPG, kidPG, isViolation, log10LOfG, log10prior, L[i]));
i++;
}
}
}
double[] posteriors = MathUtils.log10posteriorsFromLog10L(L);
log10POfGenotype = posteriors[bestIndex];
}
//log10POfViolation = Math.min(log10POfViolation, 0);
double Q = QualityUtils.phredScaleCorrectRate(Math.pow(10, log10POfGenotype));
logger.debug(String.format("log10 P of best genotype log10 post = %.2f, Q = %.2f", log10POfGenotype, Q));
MutableVariantContext mvc = new MutableVariantContext(vcIn);
mvc.putAttribute("MVQ", Q);
return new VariantContext(mvc);
}
/**
* Isolate the rest of the walker from the code to get genotype likelihood values for allele A/B in genotypeCall
* @param alleles
* @param genotypeCall
* @return
*/
private double genotypeL( List<Allele> alleles, Genotype genotypeCall ) {
String postTriplet = (String)genotypeCall.getAttribute(VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY);
if ( postTriplet == null )
throw new StingException("BUG: TrioGenotyperWalker expected genotype likelihood triplets " + VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY);
// calculate the offset -- AA => 0, AB => 1, BB => 2
int i = 0;
for ( Allele a : alleles )
i += a.isNonReference() ? 1 : 0;
// convert the corresponding GL field to a double
String log10LStrings[] = postTriplet.split(",");
return Double.valueOf(log10LStrings[i]);
}
public Integer reduceInit() { return 0; }
public Integer reduce(VariantContext vc, Integer a) {
if ( vc != null ) {
if ( a == 0 )
writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc));
writer.addRecord(VariantContextAdaptors.toVCF(vc, '.'));
a++;
}
return a;
}
public void onTraversalDone(Integer result) {} // Don't print the reduce result
}