a) Add input argument (-ncrate) to BeagleOutputToVCFWalker. If the genotype posterior error probability is higher than this threshold, we declare No-call at this genotype.

b) Add "OG" annotation to genotypes. If Beagle changes genotypes, this annotation gets the original genotype call, to ease performance  comparisons. If not, this annotation gets an empty value.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3723 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-07-06 18:33:28 +00:00
parent 4213e05aeb
commit be75b087ec
2 changed files with 170 additions and 7 deletions

View File

@ -60,6 +60,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true)
private String OUTPUT_FILE = null;
@Argument(fullName="nocall_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false)
private double noCallThreshold = 0.0;
public static final String INPUT_ROD_NAME = "inputvcf";
@ -79,7 +81,10 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beable on each site"));
hInfo.add(new VCFFormatHeaderLine("OG",1,VCFHeaderLineType.String, "Original Genotype input to Beagle"));
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site"));
hInfo.add(new VCFInfoHeaderLine("GenotypesChanged", 1, VCFHeaderLineType.Flag, "r2 Value reported by Beagle on each site"));
hInfo.add(new VCFHeaderLine("source", "BeagleImputation"));
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
@ -146,10 +151,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
// for each genotype, create a new object with Beagle information on it
for ( Map.Entry<String, Genotype> genotype : vc_input.getGenotypes().entrySet() ) {
boolean genotypesChangedByBeagle = false;
for ( Map.Entry<String, Genotype> originalGenotypes : vc_input.getGenotypes().entrySet() ) {
Genotype g = genotype.getValue();
Genotype g = originalGenotypes.getValue();
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
boolean genotypeIsPhased = true;
@ -158,7 +164,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
ArrayList<String> beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample);
ArrayList<String> beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample);
Allele originalAlleleA = g.getAllele(0);
Allele originalAlleleB = g.getAllele(1);
// We have phased genotype in hp. Need to set the isRef field in the allele.
List<Allele> alleles = new ArrayList<Allele>();
@ -195,16 +203,55 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
else // HomVar call
probWrongGenotype = hetProbability + homRefProbability;
if (1-probWrongGenotype < noCallThreshold) {
// quality is bad: don't call genotype
alleles.clear();
refAllele = Allele.NO_CALL;
altAllele = Allele.NO_CALL;
alleles.add(refAllele);
alleles.add(altAllele);
genotypeIsPhased = false;
}
if (probWrongGenotype < MIN_PROB_ERROR)
genotypeQuality = MAX_GENOTYPE_QUALITY;
else
genotypeQuality = -log10(probWrongGenotype);
Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased);
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
// get original encoding and add to keynotype attributes
String a1, a2, og;
if (originalAlleleA.isNoCall())
a1 = ".";
else if (originalAlleleA.isReference())
a1 = "0";
else
a1 = "1";
if (originalAlleleB.isNoCall())
a2 = ".";
else if (originalAlleleB.isReference())
a2 = "0";
else
a2 = "1";
og = a1+"/"+a2;
genotypes.put(genotype.getKey(), imputedGenotype);
// See if Beagle switched genotypes
if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) ||
(refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){
genotypesChangedByBeagle = true;
originalAttributes.put("OG",og);
}
else {
originalAttributes.put("OG",".");
}
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
}
@ -231,6 +278,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
}
}
vcf.addInfoField("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" );
vcf.addInfoField("R2", beagleR2Feature.getR2value().toString() );
vcfWriter.addRecord(vcf);

114
shell/runbeagle.sh 100755
View File

@ -0,0 +1,114 @@
#!/bin/bash
chr=$1
# set basic input and output paths. User should modify the following to point to:
# a) Input vcf,
# b) Output base path for all files
# c) Location of beagle.jar
# d) Path to reference file
beagle_prefix="CEUTSI.chr${chr}.bgl"
beagledir="/broad/shptmp/username/beagle/CEUTSI_Pilot1/chr${chr}/"
beaglejar="../../beagle/beagle.jar"
input_vcf="/broad/shptmp/username/beagle/CEUTSI_Pilot1/CEUTSI.recal.filtered.vcf"
tmpdir="/broad/shptmp/username/tmp"
bgloutprefix="recal.filtered"
reffile="/broad/1KG/reference/human_g1k_v37.fasta"
# Set to one to determine which sections of beagle pipeline to run
runinput=1 # Run VCF to Beagle input converter
runbgl=1 # Run Beagle
runoutput=1 # run Beagle output-to-VCF converter
runvarianteval=1 # Run Variant Evaluator to measure Beagle performance.
# Reference files for variant evaluator
dohapmap=1
do1kgchip=1
# Path to HapMap/1KG Chip truth sets for variant evaluator
if [ $dohapmap == 1 ]
then
cmpfileh="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg19_fwd.vcf"
fi
if [ $do1kgchip == 1 ]
then
cmpfile1="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_chip_jan2010/1000Genome.chip.hg19.filtered.vcf"
fi
outputbglvcf=$beagledir$bgloutprefix.$beagle_prefix.output.vcf
# check if Beagle directory exists. If not, create it.
if [ ! -d $beagledir ]
then
echo "Creating Beagle directory $beagledir"
mkdir $beagledir
fi
if [ $runinput == 1 ]
then
echo "Running GATK to create Beagle input"
java -Xmx4000m -jar ../dist/GenomeAnalysisTK.jar -L $chr -l INFO \
-R $reffile -T ProduceBeagleInput \
-B vcf,VCF,$input_vcf \
-beagle $beagledir$beagle_prefix
fi
# now, run beagle
if [ $runbgl == 1 ]
then
echo "Running Beagle..."
java -Xmx8000m -Djava.io.tmpdir=$tmpdir -jar $beaglejar like=$beagledir$beagle_prefix out=$bgloutprefix
# move output files to beagle directory
cp ./$bgloutprefix.* $beagledir
# unzip gzip'd files, force overwrite if existing
gunzip -f $beagledir$bgloutprefix.$beagle_prefix.gprobs.gz
gunzip -f $beagledir$bgloutprefix.$beagle_prefix.phased.gz
#rename also Beagle likelihood file to mantain consistency
cp $beagledir$beagle_prefix $beagledir$bgloutprefix.$beagle_prefix.like
cp $beagledir$beagle_prefix.log $beagledir$bgloutprefix.$beagle_prefix.log
fi
# run GATK to parse Beagle files and to produce output vcf
if [ $runoutput == 1 ]
then
java -Xmx4000m -Djava.io.tmpdir=$tmpdir -jar ../dist/GenomeAnalysisTK.jar \
-R $reffile -T BeagleOutputToVCF -l INFO -L $chr \
-B inputvcf,VCF,$input_vcf \
-B beagleR2,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.r2 \
-B beagleProbs,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.gprobs \
-B beaglePhased,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.phased \
-output $beagledir$bgloutprefix.$beagle_prefix.output.vcf
fi
if [ $runvarianteval == 1 ]
then
# finally, run VariantEval to produce useful comparisons between pre-and post-Beagle vcf's.
if [ $dohapmap == 1 ]
then
java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \
-R $reffile -l INFO -L $chr \
-B eval_prebeagle,VCF,$input_vcf \
-B eval_beagle,VCF,$outputbglvcf \
-B comp_hapmap,VCF,$cmpfileh \
-reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_hapmap_grep.txt
fi
if [ $do1kgchip == 1 ]
then
java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \
-R $reffile -l INFO -L $chr \
-B eval_prebeagle,VCF,$input_vcf \
-B eval_beagle,VCF,$outputbglvcf \
-B comp_1kgchip,VCF,$cmpfile1 \
-reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_1kgchip_grep.txt
fi
fi