Three important updates for Dindel genotyper:

a) Fix it up because it broke with a recent checkin to annotate vcf with unfiltered depth.
b) Printout of ref/alt alleles in output vcf was incorrect because the start/stop positions of associated GenomeLoc were incorrectly computed in case of a deletion.
c) Redid Beagle input/output walkers as not assume that ref was a single base, not to assume that variant was a vcf and generalized it to be indel-capable, so now the Beagle walkers can be used for indels as well.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4541 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-10-21 16:00:16 +00:00
parent 990677ec36
commit cf9c9ae241
5 changed files with 63 additions and 32 deletions

View File

@ -152,7 +152,7 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
if (haplotypeList.size() > 0) {
Haplotype haplotypeR = haplotypeList.get(bestIdx);
Haplotype haplotypeA = haplotypeList.get(secondBestIdx);
//System.out.format("%d %d\n",bestIdx, secondBestIdx);
// Temp hack to match old implementation's scaling, TBD better behavior
return Arrays.asList(new Haplotype(haplotypeR.getBasesAsBytes(), 60), new Haplotype(haplotypeA.getBasesAsBytes(), contextSize));

View File

@ -115,9 +115,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
return 0;
GenomeLoc loc = context.getLocation();
VariantContext vc_input = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, false);
VariantContext vc_input = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, true);
VariantContext vc_comp = tracker.getVariantContext(ref,COMP_ROD_NAME, null, loc, false);
VariantContext vc_comp = tracker.getVariantContext(ref,COMP_ROD_NAME, null, loc, true);
if ( vc_input == null )
return 0;
@ -199,32 +199,38 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
ArrayList<String> beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample);
ArrayList<String> beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample);
// original alleles at this genotype
Allele originalAlleleA = g.getAllele(0);
Allele originalAlleleB = (g.getAlleles().size() == 2) ? g.getAllele(1) : g.getAllele(0); // hack to deal with no-call genotypes
// We have phased genotype in hp. Need to set the isRef field in the allele.
List<Allele> alleles = new ArrayList<Allele>();
String alleleA = beagleGenotypePairs.get(0);
String alleleB = beagleGenotypePairs.get(1);
byte[] r = alleleA.getBytes();
// Beagle always produces genotype strings based on the strings we input in the likelihood file.
String refString = vc_input.getReference().getDisplayString();
if (refString.length() == 0) // ref was null
refString = Allele.NULL_ALLELE_STRING;
//System.out.println(context.getLocation() + " : " + alleleA + " " + alleleB);
Allele bglAlleleA, bglAlleleB;
byte rA = r[0];
if (alleleA.matches(refString))
bglAlleleA = Allele.create(alleleA,true);
else
bglAlleleA = Allele.create(alleleA,false);
Boolean isRefA = (refByte == rA);
if (alleleB.matches(refString))
bglAlleleB = Allele.create(alleleB,true);
else
bglAlleleB = Allele.create(alleleB,false);
Allele refAllele = Allele.create(r, isRefA );
alleles.add(refAllele);
r = alleleB.getBytes();
byte rB = r[0];
Boolean isRefB = (refByte == rB);
Allele altAllele = Allele.create(r,isRefB);
alleles.add(altAllele);
alleles.add(bglAlleleA);
alleles.add(bglAlleleB);
// Compute new GQ field = -10*log10Pr(Genotype call is wrong)
// Beagle gives probability that genotype is AA, AB and BB.
@ -234,20 +240,22 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
Double hetProbability = Double.valueOf(beagleProbabilities.get(1));
Double homVarProbability = Double.valueOf(beagleProbabilities.get(2));
if (isRefA && isRefB) // HomRef call
if (bglAlleleA.isReference() && bglAlleleB.isReference()) // HomRef call
probWrongGenotype = hetProbability + homVarProbability;
else if ((isRefB && !isRefA) || (isRefA && !isRefB))
else if ((bglAlleleB.isReference() && bglAlleleA.isNonReference()) || (bglAlleleA.isReference() && bglAlleleB.isNonReference()))
probWrongGenotype = homRefProbability + homVarProbability;
else // HomVar call
probWrongGenotype = hetProbability + homRefProbability;
// deal with numerical errors coming from limited formatting value on Beagle output files
if (probWrongGenotype > 1 - MIN_PROB_ERROR)
probWrongGenotype = 1 - MIN_PROB_ERROR;
if (1-probWrongGenotype < noCallThreshold) {
// quality is bad: don't call genotype
alleles.clear();
refAllele = originalAlleleA;
altAllele = originalAlleleB;
alleles.add(refAllele);
alleles.add(altAllele);
alleles.add(originalAlleleA);
alleles.add(originalAlleleB);
genotypeIsPhased = false;
}
@ -277,8 +285,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
og = a1+"/"+a2;
// See if Beagle switched genotypes
if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) ||
(refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){
if (!((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) ||
(bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))))){
originalAttributes.put("OG",og);
numGenotypesChangedByBeagle++;
}

View File

@ -109,8 +109,8 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
VariantContext variant_eval;
VariantContext validation_eval;
variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false);
validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc,false);
variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true);
validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true);
if ( goodSite(variant_eval,validation_eval) ) {
if ( useValidation(variant_eval,validation_eval, ref) ) {
writeBeagleOutput(validation_eval,variant_eval,true,validationPrior);
@ -136,7 +136,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
}
public boolean goodSite(VariantContext v) {
return v != null && ! v.isFiltered() && v.isSNP() && v.isBiallelic() && v.hasGenotypes();
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
}
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
@ -165,7 +165,8 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
}
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
beagleWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
GenomeLoc currentLoc = VariantContextUtils.getLocation(preferredVC);
beagleWriter.print(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart()));
if ( beagleGenotypesWriter != null ) {
beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
}
@ -173,9 +174,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
for ( Allele allele : preferredVC.getAlleles() ) {
String bglPrintString;
if (allele.isNoCall() || allele.isNull())
bglPrintString = "0";
bglPrintString = "-";
else
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
beagleWriter.print(String.format("%s ", bglPrintString));
}

View File

@ -30,6 +30,7 @@ import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -224,7 +225,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
HashMap<String, Object> attributes = new HashMap<String, Object>();
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
if (context.hasBasePileup())
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(context.getBasePileup()));
else if (context.hasExtendedEventPileup())
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(context.getExtendedEventPileup()));
double qual;
double[] posteriors = GLs.get(sample).getPosteriors();

View File

@ -264,9 +264,23 @@ public class UnifiedGenotyperEngine {
GenomeLoc loc = refContext.getLocus();
// todo - temp fix until we can deal with extended events properly
long endLoc;
// for indels, stop location is one more than ref allele length
boolean isSNP = true;
for (Allele a:alleles){
if (a.getBaseString().length() != 1) {
isSNP = false;
break;
}
}
if (isSNP)
endLoc = loc.getStart();
else
endLoc = loc.getStart() + refAllele.length();
//VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(),
(refAllele.length() > 0 ? loc.getStart()+refAllele.length()-1 : loc.getStart()),
VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);