a) Add input argument (-ncrate) to BeagleOutputToVCFWalker. If the genotype posterior error probability is higher than this threshold, we declare No-call at this genotype.
b) Add "OG" annotation to genotypes. If Beagle changes genotypes, this annotation gets the original genotype call, to ease performance comparisons. If not, this annotation gets an empty value. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3723 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4213e05aeb
commit
be75b087ec
|
|
@ -60,6 +60,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true)
|
||||
private String OUTPUT_FILE = null;
|
||||
|
||||
@Argument(fullName="nocall_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false)
|
||||
private double noCallThreshold = 0.0;
|
||||
|
||||
public static final String INPUT_ROD_NAME = "inputvcf";
|
||||
|
||||
|
|
@ -79,7 +81,10 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beable on each site"));
|
||||
hInfo.add(new VCFFormatHeaderLine("OG",1,VCFHeaderLineType.String, "Original Genotype input to Beagle"));
|
||||
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site"));
|
||||
hInfo.add(new VCFInfoHeaderLine("GenotypesChanged", 1, VCFHeaderLineType.Flag, "r2 Value reported by Beagle on each site"));
|
||||
|
||||
hInfo.add(new VCFHeaderLine("source", "BeagleImputation"));
|
||||
|
||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||
|
|
@ -146,10 +151,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
|
||||
// for each genotype, create a new object with Beagle information on it
|
||||
for ( Map.Entry<String, Genotype> genotype : vc_input.getGenotypes().entrySet() ) {
|
||||
boolean genotypesChangedByBeagle = false;
|
||||
for ( Map.Entry<String, Genotype> originalGenotypes : vc_input.getGenotypes().entrySet() ) {
|
||||
|
||||
|
||||
Genotype g = genotype.getValue();
|
||||
Genotype g = originalGenotypes.getValue();
|
||||
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
||||
|
||||
boolean genotypeIsPhased = true;
|
||||
|
|
@ -158,7 +164,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
ArrayList<String> beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample);
|
||||
ArrayList<String> beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample);
|
||||
|
||||
|
||||
Allele originalAlleleA = g.getAllele(0);
|
||||
Allele originalAlleleB = g.getAllele(1);
|
||||
|
||||
// We have phased genotype in hp. Need to set the isRef field in the allele.
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
|
||||
|
|
@ -195,16 +203,55 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
else // HomVar call
|
||||
probWrongGenotype = hetProbability + homRefProbability;
|
||||
|
||||
|
||||
if (1-probWrongGenotype < noCallThreshold) {
|
||||
// quality is bad: don't call genotype
|
||||
alleles.clear();
|
||||
refAllele = Allele.NO_CALL;
|
||||
altAllele = Allele.NO_CALL;
|
||||
alleles.add(refAllele);
|
||||
alleles.add(altAllele);
|
||||
genotypeIsPhased = false;
|
||||
}
|
||||
|
||||
if (probWrongGenotype < MIN_PROB_ERROR)
|
||||
genotypeQuality = MAX_GENOTYPE_QUALITY;
|
||||
else
|
||||
genotypeQuality = -log10(probWrongGenotype);
|
||||
|
||||
Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased);
|
||||
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
|
||||
|
||||
// get original encoding and add to keynotype attributes
|
||||
String a1, a2, og;
|
||||
if (originalAlleleA.isNoCall())
|
||||
a1 = ".";
|
||||
else if (originalAlleleA.isReference())
|
||||
a1 = "0";
|
||||
else
|
||||
a1 = "1";
|
||||
|
||||
if (originalAlleleB.isNoCall())
|
||||
a2 = ".";
|
||||
else if (originalAlleleB.isReference())
|
||||
a2 = "0";
|
||||
else
|
||||
a2 = "1";
|
||||
|
||||
og = a1+"/"+a2;
|
||||
|
||||
|
||||
genotypes.put(genotype.getKey(), imputedGenotype);
|
||||
// See if Beagle switched genotypes
|
||||
if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) ||
|
||||
(refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){
|
||||
genotypesChangedByBeagle = true;
|
||||
originalAttributes.put("OG",og);
|
||||
}
|
||||
else {
|
||||
originalAttributes.put("OG",".");
|
||||
}
|
||||
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
||||
|
||||
|
||||
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -231,6 +278,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
vcf.addInfoField("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" );
|
||||
vcf.addInfoField("R2", beagleR2Feature.getR2value().toString() );
|
||||
vcfWriter.addRecord(vcf);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,114 @@
|
|||
#!/bin/bash
|
||||
|
||||
|
||||
chr=$1
|
||||
|
||||
# set basic input and output paths. User should modify the following to point to:
|
||||
# a) Input vcf,
|
||||
# b) Output base path for all files
|
||||
# c) Location of beagle.jar
|
||||
# d) Path to reference file
|
||||
beagle_prefix="CEUTSI.chr${chr}.bgl"
|
||||
beagledir="/broad/shptmp/username/beagle/CEUTSI_Pilot1/chr${chr}/"
|
||||
beaglejar="../../beagle/beagle.jar"
|
||||
|
||||
input_vcf="/broad/shptmp/username/beagle/CEUTSI_Pilot1/CEUTSI.recal.filtered.vcf"
|
||||
tmpdir="/broad/shptmp/username/tmp"
|
||||
bgloutprefix="recal.filtered"
|
||||
reffile="/broad/1KG/reference/human_g1k_v37.fasta"
|
||||
|
||||
|
||||
# Set to one to determine which sections of beagle pipeline to run
|
||||
runinput=1 # Run VCF to Beagle input converter
|
||||
runbgl=1 # Run Beagle
|
||||
runoutput=1 # run Beagle output-to-VCF converter
|
||||
runvarianteval=1 # Run Variant Evaluator to measure Beagle performance.
|
||||
|
||||
# Reference files for variant evaluator
|
||||
dohapmap=1
|
||||
do1kgchip=1
|
||||
|
||||
# Path to HapMap/1KG Chip truth sets for variant evaluator
|
||||
if [ $dohapmap == 1 ]
|
||||
then
|
||||
cmpfileh="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg19_fwd.vcf"
|
||||
fi
|
||||
|
||||
if [ $do1kgchip == 1 ]
|
||||
then
|
||||
cmpfile1="/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_chip_jan2010/1000Genome.chip.hg19.filtered.vcf"
|
||||
fi
|
||||
|
||||
outputbglvcf=$beagledir$bgloutprefix.$beagle_prefix.output.vcf
|
||||
|
||||
|
||||
# check if Beagle directory exists. If not, create it.
|
||||
if [ ! -d $beagledir ]
|
||||
then
|
||||
echo "Creating Beagle directory $beagledir"
|
||||
mkdir $beagledir
|
||||
fi
|
||||
|
||||
if [ $runinput == 1 ]
|
||||
then
|
||||
echo "Running GATK to create Beagle input"
|
||||
|
||||
java -Xmx4000m -jar ../dist/GenomeAnalysisTK.jar -L $chr -l INFO \
|
||||
-R $reffile -T ProduceBeagleInput \
|
||||
-B vcf,VCF,$input_vcf \
|
||||
-beagle $beagledir$beagle_prefix
|
||||
fi
|
||||
|
||||
# now, run beagle
|
||||
if [ $runbgl == 1 ]
|
||||
then
|
||||
echo "Running Beagle..."
|
||||
java -Xmx8000m -Djava.io.tmpdir=$tmpdir -jar $beaglejar like=$beagledir$beagle_prefix out=$bgloutprefix
|
||||
|
||||
# move output files to beagle directory
|
||||
cp ./$bgloutprefix.* $beagledir
|
||||
# unzip gzip'd files, force overwrite if existing
|
||||
gunzip -f $beagledir$bgloutprefix.$beagle_prefix.gprobs.gz
|
||||
gunzip -f $beagledir$bgloutprefix.$beagle_prefix.phased.gz
|
||||
#rename also Beagle likelihood file to mantain consistency
|
||||
cp $beagledir$beagle_prefix $beagledir$bgloutprefix.$beagle_prefix.like
|
||||
cp $beagledir$beagle_prefix.log $beagledir$bgloutprefix.$beagle_prefix.log
|
||||
fi
|
||||
|
||||
# run GATK to parse Beagle files and to produce output vcf
|
||||
if [ $runoutput == 1 ]
|
||||
then
|
||||
java -Xmx4000m -Djava.io.tmpdir=$tmpdir -jar ../dist/GenomeAnalysisTK.jar \
|
||||
-R $reffile -T BeagleOutputToVCF -l INFO -L $chr \
|
||||
-B inputvcf,VCF,$input_vcf \
|
||||
-B beagleR2,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.r2 \
|
||||
-B beagleProbs,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.gprobs \
|
||||
-B beaglePhased,BEAGLE,$beagledir$bgloutprefix.$beagle_prefix.phased \
|
||||
-output $beagledir$bgloutprefix.$beagle_prefix.output.vcf
|
||||
fi
|
||||
|
||||
if [ $runvarianteval == 1 ]
|
||||
then
|
||||
# finally, run VariantEval to produce useful comparisons between pre-and post-Beagle vcf's.
|
||||
if [ $dohapmap == 1 ]
|
||||
then
|
||||
java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \
|
||||
-R $reffile -l INFO -L $chr \
|
||||
-B eval_prebeagle,VCF,$input_vcf \
|
||||
-B eval_beagle,VCF,$outputbglvcf \
|
||||
-B comp_hapmap,VCF,$cmpfileh \
|
||||
-reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_hapmap_grep.txt
|
||||
fi
|
||||
|
||||
if [ $do1kgchip == 1 ]
|
||||
then
|
||||
java -Xmx4096m -jar ../dist/GenomeAnalysisTK.jar -T VariantEval \
|
||||
-R $reffile -l INFO -L $chr \
|
||||
-B eval_prebeagle,VCF,$input_vcf \
|
||||
-B eval_beagle,VCF,$outputbglvcf \
|
||||
-B comp_1kgchip,VCF,$cmpfile1 \
|
||||
-reportType Grep -o ${beagledir}$bgloutprefix.$beagle_prefix.variant_eval_1kgchip_grep.txt
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
Loading…
Reference in New Issue