a) First fully functional (sort of) version of walker that parses Beagle imputation output files and produce a vcf with imputed genotypes.

More doc/info to follow shortly. Issues still to be solved:
a) Walker changes all genotypes based on Beagle data, but annotations on the original VCF are unchanged. They should in theory be recomputed based on new genotypes.
b) Current implementation is ugly, dirty unwieldy and will necessitate a refactoring soon so I can keep my pride. Most aesthetically affronting issue right now is that we read the full Beagle files at initialization and keep them in memory, but a more delicate implementation would just read from files on a marker by marker basis. Issue that currently prevents this is that BufferedReader() instances don't seem to play nice when called from the map() function.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3488 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-06-04 20:37:25 +00:00
parent b811e61ae1
commit ef47a69c50
2 changed files with 491 additions and 4 deletions

View File

@ -0,0 +1,483 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import java.io.*;
import java.util.*;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
* Takes files produced by Beagle imputation engine and creates a vcf with modified annotations.
*/
@Requires(value={},referenceMetaData=@RMD(name=BeagleOutputToVCFWalker.INPUT_ROD_NAME,type= VCFRecord.class))
public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
private VCFWriter vcfWriter;
@Argument(fullName="input_prefix", shortName="input", doc="The prefix added to input Beagle files gprobs, r2, ...", required=true)
private String INPUT_PREFIX = "beagle";
@Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true)
private String OUTPUT_FILE = null;
public static final String INPUT_ROD_NAME = "inputvcf";
protected static BeagleFileReader gprobsReader = null;
protected static BeagleFileReader phasedReader = null;
protected static BeagleFileReader likeReader = null;
protected static BeagleFileReader r2Reader = null;
protected static String line = null;
protected HashMap<String,BeagleSampleRecord> beagleSampleRecords;
final TreeSet<String> samples = new TreeSet<String>();
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
private final double MIN_PROB_ERROR = 0.000001;
private final double MAX_GENOTYPE_QUALITY = 6.0;
public void initialize() {
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "r2 Value reported by Beable on each site"));
hInfo.add(new VCFHeaderLine("source", "BeagleImputation"));
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
// Open output file specified by output VCF ROD
vcfWriter = new VCFWriter(new File(OUTPUT_FILE));
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY);
ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY);
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if( rod.getRecordType().equals(VCFRecord.class) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) {
final VCFReader reader = new VCFReader(rod.getFile());
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
samples.addAll(vcfSamples);
reader.close();
}
}
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
// Open Beagle files for processing
gprobsReader = new BeagleFileReader(INPUT_PREFIX.concat(".gprobs"));
likeReader = new BeagleFileReader(INPUT_PREFIX.concat(".like"));
phasedReader = new BeagleFileReader(INPUT_PREFIX.concat(".phased"));
r2Reader = new BeagleFileReader(INPUT_PREFIX.concat(".r2"));
if (!CheckAllSamplesAreInList(likeReader.getHeaderString().split(" "), samples, 3, 3))
throw new StingException("Inconsistent list of samples in File: " + likeReader.getFileName());
if (!CheckAllSamplesAreInList(gprobsReader.getHeaderString().split(" "), samples, 3, 3))
throw new StingException("Inconsistent list of samples in File: " + gprobsReader.getFileName());
if (!CheckAllSamplesAreInList(phasedReader.getHeaderString().split(" "), samples, 2, 2))
throw new StingException("Inconsistent list of samples in File: " + phasedReader.getFileName());
beagleSampleRecords = new HashMap<String,BeagleSampleRecord>();
// OK, now that we have all data in string array format inside readers, process each array individually,
for(String[] gprobLine: gprobsReader.getDataSet())
{
int j;
BeagleSampleRecord bglRecord = new BeagleSampleRecord();
LikelihoodTrios ltrio;
String markerKey = gprobLine[0];
String alleleA = gprobLine[1];
String alleleB = gprobLine[2];
HashMap<String,LikelihoodTrios> genotypeProbabilities = new HashMap<String,LikelihoodTrios>();
j = 3;
for (String sample : samples) {
ltrio = new LikelihoodTrios(new Double(gprobLine[j]), new Double(gprobLine[j+1]),
new Double(gprobLine[j+2]));
j = j+3;
genotypeProbabilities.put(sample,ltrio);
}
bglRecord.GenotypeProbabilities = genotypeProbabilities;
bglRecord.AlleleA = new Allele(alleleA);
bglRecord.AlleleB = new Allele(alleleB);
beagleSampleRecords.put(markerKey,bglRecord);
}
// now fill in input likelihood info in the same way
for (String[] likeLine: likeReader.getDataSet())
{
int j;
LikelihoodTrios ltrio;
String markerKey = likeLine[0];
String alleleA = likeLine[1];
String alleleB = likeLine[2];
HashMap<String,LikelihoodTrios> genotypeLikelihoods = new HashMap<String,LikelihoodTrios>();
j = 3;
for (String sample : samples) {
ltrio = new LikelihoodTrios(new Double(likeLine[j]), new Double(likeLine[j+1]),
new Double(likeLine[j+2]));
j = j+3;
genotypeLikelihoods.put(sample,ltrio);
}
BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey);
bglRecord.InputLikelihoods = genotypeLikelihoods;
beagleSampleRecords.put(markerKey,bglRecord);
}
for (String[] phasedLine: phasedReader.getDataSet())
{
int j;
HaplotypePair pair;
String markerKey = phasedLine[1];
HashMap<String,HaplotypePair> haplotypePairs = new HashMap<String,HaplotypePair>();
j = 2;
for (String sample : samples) {
pair = new HaplotypePair(new Allele(phasedLine[j]), new Allele(phasedLine[j+1]));
j = j+2;
haplotypePairs.put(sample,pair);
}
BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey);
bglRecord.PhasedHaplotypes = haplotypePairs;
beagleSampleRecords.put(markerKey,bglRecord);
}
for (String[] r2Line: r2Reader.getDataSet())
{
int j = 0;
String[] vals = r2Line[0].split("\t");
String markerKey = vals[0];
BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey);
bglRecord.setR2Value(Double.valueOf(vals[1]));
beagleSampleRecords.put(markerKey,bglRecord);
}
}
private boolean CheckAllSamplesAreInList(String[] stringArray, TreeSet<String> samples, int valsPerSample, int initMarkers)
{
boolean allThere = true;
if (stringArray.length != valsPerSample*samples.size()+initMarkers) {
allThere = false;
}
for (int k=initMarkers; k < stringArray.length; k++)
if (!samples.contains(stringArray[k]))
allThere = false;
return allThere;
}
/*
private class MyFileReader {
private FileReader reader;
private String fileName;
public MyFileReader(String fileName) {
try{
reader = new FileReader(fileName);
} catch ( FileNotFoundException e) {
throw new StingException("Could not find required input file: " + fileName);
}
this.fileName = fileName;
}
public String GetNextLine() {
String line;
int x = reader.read();
line.
//reader.read()
}
} */
private class BeagleFileReader {
private String headerString;
private BufferedReader reader;
private String fileName;
private HashSet<String[]> dataSet;
public String getHeaderString() {
return headerString;
}
public BufferedReader getReader() {
return reader;
}
public String getFileName() {
return fileName;
}
public HashSet<String[]> getDataSet() {
return dataSet;
}
public BeagleFileReader(String fileName) {
String curLine;
try{
reader = new BufferedReader(new FileReader(fileName));
} catch ( FileNotFoundException e) {
throw new StingException("Could not find required input file: " + fileName);
}
this.fileName = fileName;
try {
headerString = reader.readLine();
dataSet = new HashSet<String[]>();
while (( curLine = reader.readLine()) != null){
// Split current line along white spaces, ignore multiple white spaces
String lineTokens[] = curLine.split(" +");
dataSet.add(lineTokens);
}
reader.close();
}
catch (IOException e) {
throw new StingException("Failed while reading data from input file: " + fileName);
}
}
}
private class LikelihoodTrios {
public double HomRefLikelihood;
public double HetLikelihood;
public double HomVarLikelihood;
public LikelihoodTrios(double hr, double het, double hv) {
this.HomRefLikelihood = hr;
this.HetLikelihood = het;
this.HomVarLikelihood = hv;
}
}
private class HaplotypePair {
public Allele AlleleA;
public Allele AlleleB;
public HaplotypePair(Allele a, Allele b) {
this.AlleleA = a;
this.AlleleB = b;
}
}
private class BeagleSampleRecord {
// class containing, for each marker, all information necessary for Beagle input/output
public Allele AlleleA;
public Allele AlleleB;
private double r2Value;
public HashMap<String,LikelihoodTrios> GenotypeProbabilities;
public HashMap<String,LikelihoodTrios> InputLikelihoods;
public HashMap<String,HaplotypePair> PhasedHaplotypes;
public Double getR2Value() {
return r2Value;
}
public void setR2Value(Double x) {
this.r2Value = x;
}
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if ( tracker == null )
return 0;
EnumSet<VariantContext.Type> vc = EnumSet.of(VariantContext.Type.SNP);
GenomeLoc loc = context.getLocation();
VariantContext vc_input;
try {
vc_input = tracker.getVariantContext(ref,"inputvcf", vc, loc, true);
} catch (java.util.NoSuchElementException e) {
return 0;
}
if (beagleSampleRecords.containsKey(loc.toString())) {
// Get record associated with this marker from output Beagle data
BeagleSampleRecord bglRecord = beagleSampleRecords.get(loc.toString());
// get reference base for current position
byte refByte = ref.getBase();
// make new Genotypes based on Beagle results
Map<String, Genotype> genotypes;
genotypes = new HashMap<String, Genotype>(vc_input.getGenotypes().size());
// for each genotype, create a new object with Beagle information on it
for ( Map.Entry<String, Genotype> genotype : vc_input.getGenotypes().entrySet() ) {
Genotype g = genotype.getValue();
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
// modify g here with Beagle info
boolean genotypeIsPhased = true;
String sample = g.getSampleName();
LikelihoodTrios gtprobs = bglRecord.GenotypeProbabilities.get(sample);
LikelihoodTrios inplike = bglRecord.InputLikelihoods.get(sample);
HaplotypePair hp = bglRecord.PhasedHaplotypes.get(sample);
// We have phased genotype in hp. Need to set the isRef field in the allele.
List<Allele> alleles = new ArrayList<Allele>();
byte r[] = hp.AlleleA.getBases();
byte rA = r[0];
//System.out.print((char)r[0]);
Boolean isRefA = (refByte == rA);
Allele refAllele = new Allele(r, isRefA );
alleles.add(refAllele);
r = hp.AlleleB.getBases();
byte rB = r[0];
//System.out.println((char)r[0]);
Boolean isRefB = (refByte == rB);
Allele altAllele = new Allele(r,isRefB);
alleles.add(altAllele);
// Compute new GQ field = -10*log10Pr(Genotype call is wrong)
// Beagle gives probability that genotype is AA, AB and BB.
// Which, by definition, are prob of hom ref, het and hom var.
Double probWrongGenotype, genotypeQuality;
if (isRefA && isRefB) // HomRef call
probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomVarLikelihood;
else if ((isRefB && !isRefA) || (isRefA && !isRefB))
probWrongGenotype = gtprobs.HomRefLikelihood + gtprobs.HomVarLikelihood;
else // HomVar call
probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomRefLikelihood;
if (probWrongGenotype < MIN_PROB_ERROR)
genotypeQuality = MAX_GENOTYPE_QUALITY;
else
genotypeQuality = -log10(probWrongGenotype);
Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased);
genotypes.put(genotype.getKey(), imputedGenotype);
}
VariantContext filteredVC = new MutableVariantContext(vc_input.getName(), vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes());
VCFRecord vcf = VariantContextAdaptors.toVCF(filteredVC, ref.getBase(), null, false, false);
vcf.addInfoField("R2", ((Double)bglRecord.getR2Value()).toString() );
vcfWriter.addRecord(vcf);
}
return 1;
}
public Integer reduceInit() {
return 0; // Nothing to do here
}
/**
* Increment the number of loci processed.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of loci processed.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param result the number of loci seen.
*/
public void onTraversalDone(Integer result) {
out.printf("Processed %d loci.\n", result);
vcfWriter.close();
}
}

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broad.tribble.vcf.VCFRecord;
import org.broad.tribble.vcf.VCFGenotypeRecord;
@ -47,6 +47,7 @@ import java.util.*;
/**
* Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF file
* @help.summary Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF file
*/
@Requires(value={},referenceMetaData=@RMD(name="vcf",type= VCFRecord.class))
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@ -61,7 +62,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for ( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getType().equals(VCFRecord.class) ) {
if ( rod.getRecordType().equals(VCFRecord.class) ) {
final VCFReader reader = new VCFReader(rod.getFile());
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
samples.addAll(vcfSamples);
@ -89,6 +90,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
return 0;
}
// Get Reference base for this site: will be output to screen, not directly used by Beagle but rather by output analysis tools
// char re = (char)ref.getBase();
// output marker ID to Beagle input file
beagleWriter.print(String.format("%s ",vc_eval.getLocation().toString()));
@ -114,12 +118,12 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
String[] glArray = genotype.getAttributeAsString(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY).split(",");
for (String gl : glArray) {
Double d_gl = Math.pow(10, Double.valueOf(gl));
Double d_gl = 100*Math.pow(10, Double.valueOf(gl));
beagleWriter.print(String.format("%5.2f ",d_gl));
}
}
else
beagleWriter.print("0 0 0 "); // write 0 likelihoods for uncalled genotypes.
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
}