diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java index c2297698d..9faa38027 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java @@ -60,6 +60,8 @@ public class BeagleOutputToVCFWalker extends RodWalker { @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true) private String OUTPUT_FILE = null; + @Argument(fullName="nocall_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false) + private double noCallThreshold = 0.0; public static final String INPUT_ROD_NAME = "inputvcf"; @@ -79,7 +81,10 @@ public class BeagleOutputToVCFWalker extends RodWalker { final Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beable on each site")); + hInfo.add(new VCFFormatHeaderLine("OG",1,VCFHeaderLineType.String, "Original Genotype input to Beagle")); + hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site")); + hInfo.add(new VCFInfoHeaderLine("GenotypesChanged", 1, VCFHeaderLineType.Flag, "r2 Value reported by Beagle on each site")); + hInfo.add(new VCFHeaderLine("source", "BeagleImputation")); final List dataSources = this.getToolkit().getRodDataSources(); @@ -146,10 +151,11 @@ public class BeagleOutputToVCFWalker extends RodWalker { // for each genotype, create a new object with Beagle information on it - for ( Map.Entry genotype : vc_input.getGenotypes().entrySet() ) { + boolean genotypesChangedByBeagle = false; + for ( Map.Entry originalGenotypes : vc_input.getGenotypes().entrySet() ) { - Genotype g = genotype.getValue(); + Genotype g = originalGenotypes.getValue(); Set filters = new LinkedHashSet(g.getFilters()); boolean genotypeIsPhased = true; @@ -158,7 +164,9 @@ public class BeagleOutputToVCFWalker extends RodWalker { ArrayList beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample); ArrayList beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample); - + Allele originalAlleleA = g.getAllele(0); + Allele originalAlleleB = g.getAllele(1); + // We have phased genotype in hp. Need to set the isRef field in the allele. List alleles = new ArrayList(); @@ -195,16 +203,55 @@ public class BeagleOutputToVCFWalker extends RodWalker { else // HomVar call probWrongGenotype = hetProbability + homRefProbability; - + if (1-probWrongGenotype < noCallThreshold) { + // quality is bad: don't call genotype + alleles.clear(); + refAllele = Allele.NO_CALL; + altAllele = Allele.NO_CALL; + alleles.add(refAllele); + alleles.add(altAllele); + genotypeIsPhased = false; + } + if (probWrongGenotype < MIN_PROB_ERROR) genotypeQuality = MAX_GENOTYPE_QUALITY; else genotypeQuality = -log10(probWrongGenotype); - Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased); + HashMap originalAttributes = new HashMap(g.getAttributes()); + + // get original encoding and add to keynotype attributes + String a1, a2, og; + if (originalAlleleA.isNoCall()) + a1 = "."; + else if (originalAlleleA.isReference()) + a1 = "0"; + else + a1 = "1"; + + if (originalAlleleB.isNoCall()) + a2 = "."; + else if (originalAlleleB.isReference()) + a2 = "0"; + else + a2 = "1"; + + og = a1+"/"+a2; - genotypes.put(genotype.getKey(), imputedGenotype); + // See if Beagle switched genotypes + if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) || + (refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){ + genotypesChangedByBeagle = true; + originalAttributes.put("OG",og); + } + else { + originalAttributes.put("OG","."); + } + Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); + + + genotypes.put(originalGenotypes.getKey(), imputedGenotype); } @@ -231,6 +278,8 @@ public class BeagleOutputToVCFWalker extends RodWalker { } } + + vcf.addInfoField("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" ); vcf.addInfoField("R2", beagleR2Feature.getR2value().toString() ); vcfWriter.addRecord(vcf); diff --git a/shell/runbeagle.sh b/shell/runbeagle.sh new file mode 100755 index 000000000..8dce11421 --- /dev/null +++ b/shell/runbeagle.sh @@ -0,0 +1,114 @@ +#!/bin/bash + + +chr=$1 + +# set basic input and output paths. User should modify the following to point to: +# a) Input vcf, +# b) Output base path for all files +# c) Location of beagle.jar +# d) Path to reference file +beagle_prefix="CEUTSI.chr${chr}.bgl" +beagledir="/broad/shptmp/username/beagle/CEUTSI_Pilot1/chr${chr}/" +beaglejar="../../beagle/beagle.jar" + +input_vcf="/broad/shptmp/username/beagle/CEUTSI_Pilot1/CEUTSI.recal.filtered.vcf" +tmpdir="/broad/shptmp/username/tmp" +bgloutprefix="recal.filtered" +reffile="/broad/1KG/reference/human_g1k_v37.fasta" + + +# Set to one to determine which sections of beagle pipeline to run +runinput=1 # Run VCF to Beagle input converter +runbgl=1 # Run Beagle +runoutput=1 # run Beagle output-to-VCF converter +runvarianteval=1 # Run Variant Evaluator to measure Beagle performance. + +# Reference files for variant evaluator +dohapmap=1 +do1kgchip=1 + +# Path to HapMap/1KG Chip truth sets for variant evaluator +if [ $dohapmap == 1 ] +then + cmpfileh="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg19_fwd.vcf" +fi + +if [ $do1kgchip == 1 ] +then + cmpfile1="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_chip_jan2010/1000Genome.chip.hg19.filtered.vcf" +fi + +outputbglvcf=$beagledir$bgloutprefix.$beagle_prefix.output.vcf + + +# check if Beagle directory exists. If not, create it. +if [ ! -d $beagledir ] +then + echo "Creating Beagle directory $beagledir" + mkdir $beagledir +fi + +if [ $runinput == 1 ] +then + echo "Running GATK to create Beagle input" + + java -Xmx4000m -jar ../dist/GenomeAnalysisTK.jar -L $chr -l INFO \ + -R $reffile -T ProduceBeagleInput \ + -B vcf,VCF,$input_vcf \ + -beagle $beagledir$beagle_prefix +fi + +# now, run beagle +if [ $runbgl == 1 ] +then + echo "Running Beagle..." + java -Xmx8000m -Djava.io.tmpdir=$tmpdir -jar $beaglejar like=$beagledir$beagle_prefix out=$bgloutprefix + + # move output files to beagle directory + cp ./$bgloutprefix.* $beagledir + # unzip gzip'd files, force overwrite if existing + gunzip -f $beagledir$bgloutprefix.$beagle_prefix.gprobs.gz + gunzip -f $beagledir$bgloutprefix.$beagle_prefix.phased.gz + #rename also Beagle likelihood file to mantain consistency + cp $beagledir$beagle_prefix $beagledir$bgloutprefix.$beagle_prefix.like + cp $beagledir$beagle_prefix.log $beagledir$bgloutprefix.$beagle_prefix.log +fi + +# run GATK to parse Beagle files and to produce output vcf +if [ $runoutput == 1 ] +then + java -Xmx4000m -Djava.io.tmpdir=$tmpdir -jar ../dist/GenomeAnalysisTK.jar \ + -R $reffile -T BeagleOutputToVCF -l INFO -L $chr \ + -B inputvcf,VCF,$input_vcf \ + -B beagleR2,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.r2 \ + -B beagleProbs,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.gprobs \ + -B beaglePhased,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.phased \ + -output $beagledir$bgloutprefix.$beagle_prefix.output.vcf +fi + +if [ $runvarianteval == 1 ] +then + # finally, run VariantEval to produce useful comparisons between pre-and post-Beagle vcf's. + if [ $dohapmap == 1 ] + then + java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \ + -R $reffile -l INFO -L $chr \ + -B eval_prebeagle,VCF,$input_vcf \ + -B eval_beagle,VCF,$outputbglvcf \ + -B comp_hapmap,VCF,$cmpfileh \ + -reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_hapmap_grep.txt + fi + + if [ $do1kgchip == 1 ] + then + java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \ + -R $reffile -l INFO -L $chr \ + -B eval_prebeagle,VCF,$input_vcf \ + -B eval_beagle,VCF,$outputbglvcf \ + -B comp_1kgchip,VCF,$cmpfile1 \ + -reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_1kgchip_grep.txt + fi +fi + +