From be75b087eca556602b6dcd18a117ae70e7ea709c Mon Sep 17 00:00:00 2001 From: delangel Date: Tue, 6 Jul 2010 18:33:28 +0000 Subject: [PATCH] a) Add input argument (-ncrate) to BeagleOutputToVCFWalker. If the genotype posterior error probability is higher than this threshold, we declare No-call at this genotype. b) Add "OG" annotation to genotypes. If Beagle changes genotypes, this annotation gets the original genotype call, to ease performance comparisons. If not, this annotation gets an empty value. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3723 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/BeagleOutputToVCFWalker.java | 63 ++++++++-- shell/runbeagle.sh | 114 ++++++++++++++++++ 2 files changed, 170 insertions(+), 7 deletions(-) create mode 100755 shell/runbeagle.sh diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java index c2297698d..9faa38027 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java @@ -60,6 +60,8 @@ public class BeagleOutputToVCFWalker extends RodWalker { @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true) private String OUTPUT_FILE = null; + @Argument(fullName="nocall_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false) + private double noCallThreshold = 0.0; public static final String INPUT_ROD_NAME = "inputvcf"; @@ -79,7 +81,10 @@ public class BeagleOutputToVCFWalker extends RodWalker { final Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beable on each site")); + hInfo.add(new VCFFormatHeaderLine("OG",1,VCFHeaderLineType.String, "Original Genotype input to Beagle")); + hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site")); + hInfo.add(new VCFInfoHeaderLine("GenotypesChanged", 1, VCFHeaderLineType.Flag, "r2 Value reported by Beagle on each site")); + hInfo.add(new VCFHeaderLine("source", "BeagleImputation")); final List dataSources = this.getToolkit().getRodDataSources(); @@ -146,10 +151,11 @@ public class BeagleOutputToVCFWalker extends RodWalker { // for each genotype, create a new object with Beagle information on it - for ( Map.Entry genotype : vc_input.getGenotypes().entrySet() ) { + boolean genotypesChangedByBeagle = false; + for ( Map.Entry originalGenotypes : vc_input.getGenotypes().entrySet() ) { - Genotype g = genotype.getValue(); + Genotype g = originalGenotypes.getValue(); Set filters = new LinkedHashSet(g.getFilters()); boolean genotypeIsPhased = true; @@ -158,7 +164,9 @@ public class BeagleOutputToVCFWalker extends RodWalker { ArrayList beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample); ArrayList beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample); - + Allele originalAlleleA = g.getAllele(0); + Allele originalAlleleB = g.getAllele(1); + // We have phased genotype in hp. Need to set the isRef field in the allele. List alleles = new ArrayList(); @@ -195,16 +203,55 @@ public class BeagleOutputToVCFWalker extends RodWalker { else // HomVar call probWrongGenotype = hetProbability + homRefProbability; - + if (1-probWrongGenotype < noCallThreshold) { + // quality is bad: don't call genotype + alleles.clear(); + refAllele = Allele.NO_CALL; + altAllele = Allele.NO_CALL; + alleles.add(refAllele); + alleles.add(altAllele); + genotypeIsPhased = false; + } + if (probWrongGenotype < MIN_PROB_ERROR) genotypeQuality = MAX_GENOTYPE_QUALITY; else genotypeQuality = -log10(probWrongGenotype); - Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased); + HashMap originalAttributes = new HashMap(g.getAttributes()); + + // get original encoding and add to keynotype attributes + String a1, a2, og; + if (originalAlleleA.isNoCall()) + a1 = "."; + else if (originalAlleleA.isReference()) + a1 = "0"; + else + a1 = "1"; + + if (originalAlleleB.isNoCall()) + a2 = "."; + else if (originalAlleleB.isReference()) + a2 = "0"; + else + a2 = "1"; + + og = a1+"/"+a2; - genotypes.put(genotype.getKey(), imputedGenotype); + // See if Beagle switched genotypes + if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) || + (refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){ + genotypesChangedByBeagle = true; + originalAttributes.put("OG",og); + } + else { + originalAttributes.put("OG","."); + } + Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); + + + genotypes.put(originalGenotypes.getKey(), imputedGenotype); } @@ -231,6 +278,8 @@ public class BeagleOutputToVCFWalker extends RodWalker { } } + + vcf.addInfoField("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" ); vcf.addInfoField("R2", beagleR2Feature.getR2value().toString() ); vcfWriter.addRecord(vcf); diff --git a/shell/runbeagle.sh b/shell/runbeagle.sh new file mode 100755 index 000000000..8dce11421 --- /dev/null +++ b/shell/runbeagle.sh @@ -0,0 +1,114 @@ +#!/bin/bash + + +chr=$1 + +# set basic input and output paths. User should modify the following to point to: +# a) Input vcf, +# b) Output base path for all files +# c) Location of beagle.jar +# d) Path to reference file +beagle_prefix="CEUTSI.chr${chr}.bgl" +beagledir="/broad/shptmp/username/beagle/CEUTSI_Pilot1/chr${chr}/" +beaglejar="../../beagle/beagle.jar" + +input_vcf="/broad/shptmp/username/beagle/CEUTSI_Pilot1/CEUTSI.recal.filtered.vcf" +tmpdir="/broad/shptmp/username/tmp" +bgloutprefix="recal.filtered" +reffile="/broad/1KG/reference/human_g1k_v37.fasta" + + +# Set to one to determine which sections of beagle pipeline to run +runinput=1 # Run VCF to Beagle input converter +runbgl=1 # Run Beagle +runoutput=1 # run Beagle output-to-VCF converter +runvarianteval=1 # Run Variant Evaluator to measure Beagle performance. + +# Reference files for variant evaluator +dohapmap=1 +do1kgchip=1 + +# Path to HapMap/1KG Chip truth sets for variant evaluator +if [ $dohapmap == 1 ] +then + cmpfileh="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg19_fwd.vcf" +fi + +if [ $do1kgchip == 1 ] +then + cmpfile1="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_chip_jan2010/1000Genome.chip.hg19.filtered.vcf" +fi + +outputbglvcf=$beagledir$bgloutprefix.$beagle_prefix.output.vcf + + +# check if Beagle directory exists. If not, create it. +if [ ! -d $beagledir ] +then + echo "Creating Beagle directory $beagledir" + mkdir $beagledir +fi + +if [ $runinput == 1 ] +then + echo "Running GATK to create Beagle input" + + java -Xmx4000m -jar ../dist/GenomeAnalysisTK.jar -L $chr -l INFO \ + -R $reffile -T ProduceBeagleInput \ + -B vcf,VCF,$input_vcf \ + -beagle $beagledir$beagle_prefix +fi + +# now, run beagle +if [ $runbgl == 1 ] +then + echo "Running Beagle..." + java -Xmx8000m -Djava.io.tmpdir=$tmpdir -jar $beaglejar like=$beagledir$beagle_prefix out=$bgloutprefix + + # move output files to beagle directory + cp ./$bgloutprefix.* $beagledir + # unzip gzip'd files, force overwrite if existing + gunzip -f $beagledir$bgloutprefix.$beagle_prefix.gprobs.gz + gunzip -f $beagledir$bgloutprefix.$beagle_prefix.phased.gz + #rename also Beagle likelihood file to mantain consistency + cp $beagledir$beagle_prefix $beagledir$bgloutprefix.$beagle_prefix.like + cp $beagledir$beagle_prefix.log $beagledir$bgloutprefix.$beagle_prefix.log +fi + +# run GATK to parse Beagle files and to produce output vcf +if [ $runoutput == 1 ] +then + java -Xmx4000m -Djava.io.tmpdir=$tmpdir -jar ../dist/GenomeAnalysisTK.jar \ + -R $reffile -T BeagleOutputToVCF -l INFO -L $chr \ + -B inputvcf,VCF,$input_vcf \ + -B beagleR2,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.r2 \ + -B beagleProbs,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.gprobs \ + -B beaglePhased,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.phased \ + -output $beagledir$bgloutprefix.$beagle_prefix.output.vcf +fi + +if [ $runvarianteval == 1 ] +then + # finally, run VariantEval to produce useful comparisons between pre-and post-Beagle vcf's. + if [ $dohapmap == 1 ] + then + java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \ + -R $reffile -l INFO -L $chr \ + -B eval_prebeagle,VCF,$input_vcf \ + -B eval_beagle,VCF,$outputbglvcf \ + -B comp_hapmap,VCF,$cmpfileh \ + -reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_hapmap_grep.txt + fi + + if [ $do1kgchip == 1 ] + then + java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \ + -R $reffile -l INFO -L $chr \ + -B eval_prebeagle,VCF,$input_vcf \ + -B eval_beagle,VCF,$outputbglvcf \ + -B comp_1kgchip,VCF,$cmpfile1 \ + -reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_1kgchip_grep.txt + fi +fi + +