From cf9c9ae2419845a9f177eda6f8dc1777168a5a51 Mon Sep 17 00:00:00 2001 From: delangel Date: Thu, 21 Oct 2010 16:00:16 +0000 Subject: [PATCH] Three important updates for Dindel genotyper: a) Fix it up because it broke with a recent checkin to annotate vcf with unfiltered depth. b) Printout of ref/alt alleles in output vcf was incorrect because the start/stop positions of associated GenomeLoc were incorrectly computed in case of a deletion. c) Redid Beagle input/output walkers as not assume that ref was a single base, not to assume that variant was a vcf and generalized it to be indel-capable, so now the Beagle walkers can be used for indels as well. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4541 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/annotator/HaplotypeScore.java | 2 +- .../beagle/BeagleOutputToVCFWalker.java | 52 +++++++++++-------- .../beagle/ProduceBeagleInputWalker.java | 13 ++--- .../genotyper/ExactAFCalculationModel.java | 10 +++- .../genotyper/UnifiedGenotyperEngine.java | 18 ++++++- 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 8861d1acc..b3288d738 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -152,7 +152,7 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { if (haplotypeList.size() > 0) { Haplotype haplotypeR = haplotypeList.get(bestIdx); Haplotype haplotypeA = haplotypeList.get(secondBestIdx); - + //System.out.format("%d %d\n",bestIdx, secondBestIdx); // Temp hack to match old implementation's scaling, TBD better behavior return Arrays.asList(new Haplotype(haplotypeR.getBasesAsBytes(), 60), new Haplotype(haplotypeA.getBasesAsBytes(), contextSize)); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 3979a16c2..dfd7077fe 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -115,9 +115,9 @@ public class BeagleOutputToVCFWalker extends RodWalker { return 0; GenomeLoc loc = context.getLocation(); - VariantContext vc_input = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, false); + VariantContext vc_input = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, true); - VariantContext vc_comp = tracker.getVariantContext(ref,COMP_ROD_NAME, null, loc, false); + VariantContext vc_comp = tracker.getVariantContext(ref,COMP_ROD_NAME, null, loc, true); if ( vc_input == null ) return 0; @@ -199,32 +199,38 @@ public class BeagleOutputToVCFWalker extends RodWalker { ArrayList beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample); ArrayList beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample); + // original alleles at this genotype Allele originalAlleleA = g.getAllele(0); + Allele originalAlleleB = (g.getAlleles().size() == 2) ? g.getAllele(1) : g.getAllele(0); // hack to deal with no-call genotypes + // We have phased genotype in hp. Need to set the isRef field in the allele. List alleles = new ArrayList(); String alleleA = beagleGenotypePairs.get(0); String alleleB = beagleGenotypePairs.get(1); - byte[] r = alleleA.getBytes(); + // Beagle always produces genotype strings based on the strings we input in the likelihood file. + String refString = vc_input.getReference().getDisplayString(); + if (refString.length() == 0) // ref was null + refString = Allele.NULL_ALLELE_STRING; - //System.out.println(context.getLocation() + " : " + alleleA + " " + alleleB); + Allele bglAlleleA, bglAlleleB; - byte rA = r[0]; + if (alleleA.matches(refString)) + bglAlleleA = Allele.create(alleleA,true); + else + bglAlleleA = Allele.create(alleleA,false); - Boolean isRefA = (refByte == rA); + if (alleleB.matches(refString)) + bglAlleleB = Allele.create(alleleB,true); + else + bglAlleleB = Allele.create(alleleB,false); - Allele refAllele = Allele.create(r, isRefA ); - alleles.add(refAllele); - r = alleleB.getBytes(); - byte rB = r[0]; - - Boolean isRefB = (refByte == rB); - Allele altAllele = Allele.create(r,isRefB); - alleles.add(altAllele); + alleles.add(bglAlleleA); + alleles.add(bglAlleleB); // Compute new GQ field = -10*log10Pr(Genotype call is wrong) // Beagle gives probability that genotype is AA, AB and BB. @@ -234,20 +240,22 @@ public class BeagleOutputToVCFWalker extends RodWalker { Double hetProbability = Double.valueOf(beagleProbabilities.get(1)); Double homVarProbability = Double.valueOf(beagleProbabilities.get(2)); - if (isRefA && isRefB) // HomRef call + if (bglAlleleA.isReference() && bglAlleleB.isReference()) // HomRef call probWrongGenotype = hetProbability + homVarProbability; - else if ((isRefB && !isRefA) || (isRefA && !isRefB)) + else if ((bglAlleleB.isReference() && bglAlleleA.isNonReference()) || (bglAlleleA.isReference() && bglAlleleB.isNonReference())) probWrongGenotype = homRefProbability + homVarProbability; else // HomVar call probWrongGenotype = hetProbability + homRefProbability; + // deal with numerical errors coming from limited formatting value on Beagle output files + if (probWrongGenotype > 1 - MIN_PROB_ERROR) + probWrongGenotype = 1 - MIN_PROB_ERROR; + if (1-probWrongGenotype < noCallThreshold) { // quality is bad: don't call genotype alleles.clear(); - refAllele = originalAlleleA; - altAllele = originalAlleleB; - alleles.add(refAllele); - alleles.add(altAllele); + alleles.add(originalAlleleA); + alleles.add(originalAlleleB); genotypeIsPhased = false; } @@ -277,8 +285,8 @@ public class BeagleOutputToVCFWalker extends RodWalker { og = a1+"/"+a2; // See if Beagle switched genotypes - if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) || - (refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){ + if (!((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) || + (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))))){ originalAttributes.put("OG",og); numGenotypesChangedByBeagle++; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index e8133ae34..d64acdbac 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -109,8 +109,8 @@ public class ProduceBeagleInputWalker extends RodWalker { VariantContext variant_eval; VariantContext validation_eval; - variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false); - validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc,false); + variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true); + validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true); if ( goodSite(variant_eval,validation_eval) ) { if ( useValidation(variant_eval,validation_eval, ref) ) { writeBeagleOutput(validation_eval,variant_eval,true,validationPrior); @@ -136,7 +136,7 @@ public class ProduceBeagleInputWalker extends RodWalker { } public boolean goodSite(VariantContext v) { - return v != null && ! v.isFiltered() && v.isSNP() && v.isBiallelic() && v.hasGenotypes(); + return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); } public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) { @@ -165,7 +165,8 @@ public class ProduceBeagleInputWalker extends RodWalker { } public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { - beagleWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString())); + GenomeLoc currentLoc = VariantContextUtils.getLocation(preferredVC); + beagleWriter.print(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart())); if ( beagleGenotypesWriter != null ) { beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString())); } @@ -173,9 +174,9 @@ public class ProduceBeagleInputWalker extends RodWalker { for ( Allele allele : preferredVC.getAlleles() ) { String bglPrintString; if (allele.isNoCall() || allele.isNull()) - bglPrintString = "0"; + bglPrintString = "-"; else - bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele + bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele beagleWriter.print(String.format("%s ", bglPrintString)); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java index 5b5a860f6..a3f42ec33 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -30,6 +30,7 @@ import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.GenotypeLikelihoods; import org.broad.tribble.vcf.VCFConstants; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -224,7 +225,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { HashMap attributes = new HashMap(); ArrayList myAlleles = new ArrayList(); - attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup())); + AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + + if (context.hasBasePileup()) + attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(context.getBasePileup())); + + else if (context.hasExtendedEventPileup()) + attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(context.getExtendedEventPileup())); + double qual; double[] posteriors = GLs.get(sample).getPosteriors(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 6cd1bb7a8..48ef10b81 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -264,9 +264,23 @@ public class UnifiedGenotyperEngine { GenomeLoc loc = refContext.getLocus(); // todo - temp fix until we can deal with extended events properly + long endLoc; + // for indels, stop location is one more than ref allele length + boolean isSNP = true; + for (Allele a:alleles){ + if (a.getBaseString().length() != 1) { + isSNP = false; + break; + } + } + + if (isSNP) + endLoc = loc.getStart(); + else + endLoc = loc.getStart() + refAllele.length(); + //VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes); - VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), - (refAllele.length() > 0 ? loc.getStart()+refAllele.length()-1 : loc.getStart()), + VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);