From 5992b79159307ab1fa8c6b9af862845aabd625a6 Mon Sep 17 00:00:00 2001 From: delangel Date: Mon, 12 Jul 2010 19:54:58 +0000 Subject: [PATCH] a) Simplify normalization code in ProduceBeagleInputWalker, as to always normalize, and use MathUtils.normalizeFromLog10 to do this. b) Several improvements to BeagleOutputToVCFWalker: 1. If a Hapmap input track is provided (e.g. -B comp,VCF,file), Hapmap sites will be annotated with Hapmap Allele count and allele frequency (key ACH, AFH). 2. If probability of correct genotype is lower than ncthr (optional argument provided by user, default = 0.0), walker will keep original calls instead of using Beagle calls. 3. Instead of annotating just whether Beagle had modified a site, annotate instead HOW MANY genotypes in a site were actually changed by Beagle. All three improvements are mostly for debugging and analysis only. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3769 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/BeagleOutputToVCFWalker.java | 130 +++++++++++++----- .../walkers/ProduceBeagleInputWalker.java | 18 +-- 2 files changed, 103 insertions(+), 45 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java index d228de697..6ed1d0078 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java @@ -29,9 +29,11 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.*; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -54,6 +56,10 @@ import static java.lang.Math.log10; public class BeagleOutputToVCFWalker extends RodWalker { public static final String INPUT_ROD_NAME = "variant"; + public static final String COMP_ROD_NAME = "comp"; + public static final String R2_ROD_NAME = "beagleR2"; + public static final String PROBS_ROD_NAME = "beagleProbs"; + public static final String PHASED_ROD_NAME = "beaglePhased"; @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true) private String OUTPUT_FILE = null; @@ -64,7 +70,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { protected static String line = null; private VCFWriter vcfWriter; - // protected HashMap beagleSampleRecords; + private final double MIN_PROB_ERROR = 0.000001; private final double MAX_GENOTYPE_QUALITY = 6.0; @@ -77,14 +83,27 @@ public class BeagleOutputToVCFWalker extends RodWalker { hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.add(new VCFFormatHeaderLine("OG",1, VCFHeaderLineType.String, "Original Genotype input to Beagle")); hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site")); - hInfo.add(new VCFInfoHeaderLine("GenotypesChanged", 1, VCFHeaderLineType.Flag, "r2 Value reported by Beagle on each site")); + hInfo.add(new VCFInfoHeaderLine("NumGenotypesChanged", 1, VCFHeaderLineType.Integer, "r2 Value reported by Beagle on each site")); hInfo.add(new VCFHeaderLine("source", "BeagleImputation")); // Open output file specified by output VCF ROD vcfWriter = new VCFWriter(new File(OUTPUT_FILE), true); + final List dataSources = this.getToolkit().getRodDataSources(); + + for( final ReferenceOrderedDataSource source : dataSources ) { + final RMDTrack rod = source.getReferenceOrderedData(); + + if (rod.getRecordType().equals(VCFRecord.class) && rod.getName().equalsIgnoreCase(COMP_ROD_NAME)) { + hInfo.add(new VCFInfoHeaderLine("ACH", 1, VCFHeaderLineType.Integer, "Allele Count from Hapmap at this site")); + hInfo.add(new VCFInfoHeaderLine("ANH", 1, VCFHeaderLineType.Integer, "Allele Frequency from Hapmap at this site")); + hInfo.add(new VCFInfoHeaderLine("AFH", 1, VCFHeaderLineType.Float, "Allele Number from Hapmap at this site")); + } + + } Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(INPUT_ROD_NAME)); + final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); } @@ -95,11 +114,14 @@ public class BeagleOutputToVCFWalker extends RodWalker { return 0; GenomeLoc loc = context.getLocation(); - VariantContext vc_input = tracker.getVariantContext(ref,"inputvcf", null, loc, false); + VariantContext vc_input = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, false); + + VariantContext vc_comp = tracker.getVariantContext(ref,COMP_ROD_NAME, null, loc, false); + if ( vc_input == null ) return 0; - List r2rods = tracker.getReferenceMetaData("beagleR2"); + List r2rods = tracker.getReferenceMetaData(R2_ROD_NAME); // ignore places where we don't have a variant if ( r2rods.size() == 0 ) @@ -107,7 +129,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { BeagleFeature beagleR2Feature = (BeagleFeature)r2rods.get(0); - List gProbsrods = tracker.getReferenceMetaData("beagleProbs"); + List gProbsrods = tracker.getReferenceMetaData(PROBS_ROD_NAME); // ignore places where we don't have a variant if ( gProbsrods.size() == 0 ) @@ -115,7 +137,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { BeagleFeature beagleProbsFeature = (BeagleFeature)gProbsrods.get(0); - List gPhasedrods = tracker.getReferenceMetaData("beaglePhased"); + List gPhasedrods = tracker.getReferenceMetaData(PHASED_ROD_NAME); // ignore places where we don't have a variant if ( gPhasedrods.size() == 0 ) @@ -131,7 +153,17 @@ public class BeagleOutputToVCFWalker extends RodWalker { // for each genotype, create a new object with Beagle information on it - boolean genotypesChangedByBeagle = false; + + int numGenotypesChangedByBeagle = 0; + Integer alleleCountH = 0, chrCountH = 0; + Double alleleFrequencyH = 0.0; + + Map hapmapGenotypes = null; + + if (vc_comp != null) { + hapmapGenotypes = vc_comp.getGenotypes(); + } + for ( Map.Entry originalGenotypes : vc_input.getGenotypes().entrySet() ) { Genotype g = originalGenotypes.getValue(); @@ -140,6 +172,24 @@ public class BeagleOutputToVCFWalker extends RodWalker { boolean genotypeIsPhased = true; String sample = g.getSampleName(); + // If we have a Hapmap (comp) ROD, compute Hapmap AC, AN and AF + // use sample as key into genotypes structure + if (vc_comp != null) { + + if (vc_input.getGenotypes().containsKey(sample) && hapmapGenotypes.containsKey(sample)) { + + Genotype hapmapGenotype = hapmapGenotypes.get(sample); + if (hapmapGenotype.isCalled()){ + chrCountH += 2; + if (hapmapGenotype.isHet()) { + alleleCountH += 1; + } else if (hapmapGenotype.isHomVar()) { + alleleCountH += 2; + } + } + } + } + ArrayList beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample); ArrayList beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample); @@ -185,8 +235,8 @@ public class BeagleOutputToVCFWalker extends RodWalker { if (1-probWrongGenotype < noCallThreshold) { // quality is bad: don't call genotype alleles.clear(); - refAllele = Allele.NO_CALL; - altAllele = Allele.NO_CALL; + refAllele = originalAlleleA; + altAllele = originalAlleleB; alleles.add(refAllele); alleles.add(altAllele); genotypeIsPhased = false; @@ -219,10 +269,10 @@ public class BeagleOutputToVCFWalker extends RodWalker { // See if Beagle switched genotypes if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) || - (refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){ - genotypesChangedByBeagle = true; + (refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){ originalAttributes.put("OG",og); - } + numGenotypesChangedByBeagle++; + } else { originalAttributes.put("OG","."); } @@ -231,34 +281,46 @@ public class BeagleOutputToVCFWalker extends RodWalker { genotypes.put(originalGenotypes.getKey(), imputedGenotype); - } + } - VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); + VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); - Set altAlleles = filteredVC.getAlternateAlleles(); - StringBuffer altAlleleCountString = new StringBuffer(); - for ( Allele allele : altAlleles ) { - if ( altAlleleCountString.length() > 0 ) - altAlleleCountString.append(","); - altAlleleCountString.append(filteredVC.getChromosomeCount(allele)); - } + Set altAlleles = filteredVC.getAlternateAlleles(); + StringBuffer altAlleleCountString = new StringBuffer(); + for ( Allele allele : altAlleles ) { + if ( altAlleleCountString.length() > 0 ) + altAlleleCountString.append(","); + altAlleleCountString.append(filteredVC.getChromosomeCount(allele)); + } - HashMap attributes = new HashMap(filteredVC.getAttributes()); - if ( filteredVC.getChromosomeCount() > 0 ) { - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount())); - if ( altAlleleCountString.length() > 0 ) { - attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString()); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f", - Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount()))); - } - } + HashMap attributes = new HashMap(filteredVC.getAttributes()); + if ( filteredVC.getChromosomeCount() > 0 ) { + attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount())); + if ( altAlleleCountString.length() > 0 ) { + attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString()); + attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f", + Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount()))); + } + } - attributes.put("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" ); - attributes.put("R2", beagleR2Feature.getR2value().toString() ); + // Get Hapmap AC and AF + if (vc_comp != null) { + attributes.put("ACH", alleleCountH.toString() ); + attributes.put("ANH", chrCountH.toString() ); + attributes.put("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) ); - vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), new byte[]{ref.getBase()}); + } + + attributes.put("NumGenotypesChanged", numGenotypesChangedByBeagle ); + attributes.put("R2", beagleR2Feature.getR2value().toString() ); + + + + vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), new byte[]{ref.getBase()}); + + + return 1; - return 1; } public Integer reduceInit() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java index 0694e827f..041db79fc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java @@ -113,25 +113,21 @@ public class ProduceBeagleInputWalker extends RodWalker { if (genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(","); - Double maxLikelihood = -100.0; - ArrayList likeArray = new ArrayList(); + double[] likeArray = new double[glArray.length]; + // convert to double array so we can normalize + int k=0; for (String gl : glArray) { - // need to normalize likelihoods to avoid precision loss. In worst case, if all 3 log-likelihoods are too - // small, we could end up with linear likelihoods of form 0.00 0.00 0.00 which will mess up imputation. - Double dg = Double.valueOf(gl); - if (dg> maxLikelihood) - maxLikelihood = dg; - - likeArray.add(dg); + likeArray[k++] = Double.valueOf(gl); } + double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray); // see if we need to randomly mask out genotype in this position. Double d = generator.nextDouble(); if (d > insertedNoCallRate ) { // System.out.format("%5.4f ", d); - for (Double likeVal: likeArray) - beagleWriter.print(String.format("%5.4f ",Math.pow(10, likeVal-maxLikelihood))); + for (Double likeVal: normalizedLikelihoods) + beagleWriter.print(String.format("%5.4f ",likeVal)); } else { // we are masking out this genotype