a) Update annotations when creating new vcf with Beagle's imputed data. Since genotypes may (will) change based on imputation, several annotations need to be updated. By default, AC, AF, AN and AB will be updated. User can force extra annotaqtions to be updated with -A <annotation> argument.

b) Several cleanups and beautifications.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3499 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-06-08 15:12:04 +00:00
parent 933133ee28
commit 907931c902
1 changed files with 63 additions and 18 deletions

View File

@ -29,6 +29,7 @@ import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*; import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -37,6 +38,8 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
@ -46,7 +49,6 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import java.io.*; import java.io.*;
import java.util.*; import java.util.*;
import static java.lang.Math.log10; import static java.lang.Math.log10;
import static java.lang.Math.pow;
/** /**
@ -63,6 +65,16 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true) @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true)
private String OUTPUT_FILE = null; private String OUTPUT_FILE = null;
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected String[] annotationsToUse = {"AlleleBalance"};
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = {};
@Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false)
protected Boolean USE_ALL_ANNOTATIONS = false;
public static final String INPUT_ROD_NAME = "inputvcf"; public static final String INPUT_ROD_NAME = "inputvcf";
protected static BeagleFileReader gprobsReader = null; protected static BeagleFileReader gprobsReader = null;
@ -70,33 +82,37 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
protected static BeagleFileReader likeReader = null; protected static BeagleFileReader likeReader = null;
protected static BeagleFileReader r2Reader = null; protected static BeagleFileReader r2Reader = null;
private VariantAnnotatorEngine engine;
protected static String line = null; protected static String line = null;
protected HashMap<String,BeagleSampleRecord> beagleSampleRecords; protected HashMap<String,BeagleSampleRecord> beagleSampleRecords;
final TreeSet<String> samples = new TreeSet<String>(); final TreeSet<String> samples = new TreeSet<String>();
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
private final double MIN_PROB_ERROR = 0.000001; private final double MIN_PROB_ERROR = 0.000001;
private final double MAX_GENOTYPE_QUALITY = 6.0; private final double MAX_GENOTYPE_QUALITY = 6.0;
public void initialize() { public void initialize() {
if ( USE_ALL_ANNOTATIONS )
engine = new VariantAnnotatorEngine(getToolkit());
else
engine = new VariantAnnotatorEngine(getToolkit(), annotationClassesToUse, annotationsToUse);
// setup the header fields // setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>(); final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "r2 Value reported by Beable on each site")); hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "r2 Value reported by Beable on each site"));
hInfo.add(new VCFHeaderLine("source", "BeagleImputation")); hInfo.add(new VCFHeaderLine("source", "BeagleImputation"));
hInfo.addAll(engine.getVCFAnnotationDescriptions());
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources(); final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
// Open output file specified by output VCF ROD // Open output file specified by output VCF ROD
vcfWriter = new VCFWriter(new File(OUTPUT_FILE)); vcfWriter = new VCFWriter(new File(OUTPUT_FILE));
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY);
for( final ReferenceOrderedDataSource source : dataSources ) { for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData(); final RMDTrack rod = source.getReferenceOrderedData();
if( rod.getRecordType().equals(VCFRecord.class) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) { if( rod.getRecordType().equals(VCFRecord.class) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) {
@ -164,8 +180,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
LikelihoodTrios ltrio; LikelihoodTrios ltrio;
String markerKey = likeLine[0]; String markerKey = likeLine[0];
String alleleA = likeLine[1];
String alleleB = likeLine[2];
HashMap<String,LikelihoodTrios> genotypeLikelihoods = new HashMap<String,LikelihoodTrios>(); HashMap<String,LikelihoodTrios> genotypeLikelihoods = new HashMap<String,LikelihoodTrios>();
@ -208,7 +222,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
for (String[] r2Line: r2Reader.getDataSet()) for (String[] r2Line: r2Reader.getDataSet())
{ {
int j = 0;
String[] vals = r2Line[0].split("\t"); String[] vals = r2Line[0].split("\t");
String markerKey = vals[0]; String markerKey = vals[0];
@ -370,7 +383,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
String sample = g.getSampleName(); String sample = g.getSampleName();
LikelihoodTrios gtprobs = bglRecord.GenotypeProbabilities.get(sample); LikelihoodTrios gtprobs = bglRecord.GenotypeProbabilities.get(sample);
LikelihoodTrios inplike = bglRecord.InputLikelihoods.get(sample); // LikelihoodTrios inplike = bglRecord.InputLikelihoods.get(sample);
HaplotypePair hp = bglRecord.PhasedHaplotypes.get(sample); HaplotypePair hp = bglRecord.PhasedHaplotypes.get(sample);
// We have phased genotype in hp. Need to set the isRef field in the allele. // We have phased genotype in hp. Need to set the isRef field in the allele.
@ -378,7 +391,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
byte r[] = hp.AlleleA.getBases(); byte r[] = hp.AlleleA.getBases();
byte rA = r[0]; byte rA = r[0];
//System.out.print((char)r[0]);
Boolean isRefA = (refByte == rA); Boolean isRefA = (refByte == rA);
@ -387,7 +399,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
r = hp.AlleleB.getBases(); r = hp.AlleleB.getBases();
byte rB = r[0]; byte rB = r[0];
//System.out.println((char)r[0]);
Boolean isRefB = (refByte == rB); Boolean isRefB = (refByte == rB);
Allele altAllele = Allele.create(r,isRefB); Allele altAllele = Allele.create(r,isRefB);
@ -402,7 +413,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
if (isRefA && isRefB) // HomRef call if (isRefA && isRefB) // HomRef call
probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomVarLikelihood; probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomVarLikelihood;
else if ((isRefB && !isRefA) || (isRefA && !isRefB)) else if ((isRefB && !isRefA) || (isRefA && !isRefB))
probWrongGenotype = gtprobs.HomRefLikelihood + gtprobs.HomVarLikelihood; probWrongGenotype = gtprobs.HomRefLikelihood + gtprobs.HomVarLikelihood;
else // HomVar call else // HomVar call
probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomRefLikelihood; probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomRefLikelihood;
@ -419,14 +430,48 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
} }
VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes());
VariantContext filteredVC = new MutableVariantContext(vc_input.getName(), vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); Set<Allele> altAlleles = filteredVC.getAlternateAlleles();
StringBuffer altAlleleCountString = new StringBuffer();
for ( Allele allele : altAlleles ) {
if ( altAlleleCountString.length() > 0 )
altAlleleCountString.append(",");
altAlleleCountString.append(filteredVC.getChromosomeCount(allele));
}
VCFRecord vcf = VariantContextAdaptors.toVCF(filteredVC, ref.getBase(), null, false, false);
vcf.addInfoField("R2", ((Double)bglRecord.getR2Value()).toString() ); // if the reference base is not ambiguous, we can annotate
vcfWriter.addRecord(vcf); Collection<VariantContext> annotatedVCs = Arrays.asList(filteredVC);
Map<String, StratifiedAlignmentContext> stratifiedContexts;
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
if ( ! context.hasExtendedEventPileup() ) {
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
} else {
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getExtendedEventPileup());
}
if ( stratifiedContexts != null ) {
annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, filteredVC);
}
}
for(VariantContext annotatedVC : annotatedVCs ) {
VCFRecord vcf = VariantContextAdaptors.toVCF(filteredVC, ref.getBase());
if ( annotatedVC.getChromosomeCount() > 0 ) {
vcf.addInfoField(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", annotatedVC.getChromosomeCount()));
if ( altAlleleCountString.length() > 0 ) {
vcf.addInfoField(VCFRecord.ALLELE_COUNT_KEY, altAlleleCountString.toString());
vcf.addInfoField(VCFRecord.ALLELE_FREQUENCY_KEY, String.format("%4.2f",
Double.valueOf(altAlleleCountString.toString())/(annotatedVC.getChromosomeCount())));
}
}
vcf.addInfoField("R2", (bglRecord.getR2Value()).toString() );
vcfWriter.addRecord(vcf);
}
} }
return 1; return 1;