diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java new file mode 100755 index 000000000..cc4161aee --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.*; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; + +import java.io.*; +import java.util.*; +import static java.lang.Math.log10; +import static java.lang.Math.pow; + + +/** + * Takes files produced by Beagle imputation engine and creates a vcf with modified annotations. + */ +@Requires(value={},referenceMetaData=@RMD(name=BeagleOutputToVCFWalker.INPUT_ROD_NAME,type= VCFRecord.class)) +public class BeagleOutputToVCFWalker extends RodWalker { + + private VCFWriter vcfWriter; + + @Argument(fullName="input_prefix", shortName="input", doc="The prefix added to input Beagle files gprobs, r2, ...", required=true) + private String INPUT_PREFIX = "beagle"; + + @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true) + private String OUTPUT_FILE = null; + + public static final String INPUT_ROD_NAME = "inputvcf"; + + protected static BeagleFileReader gprobsReader = null; + protected static BeagleFileReader phasedReader = null; + protected static BeagleFileReader likeReader = null; + protected static BeagleFileReader r2Reader = null; + + protected static String line = null; + + protected HashMap beagleSampleRecords; + final TreeSet samples = new TreeSet(); + private final ArrayList ALLOWED_FORMAT_FIELDS = new ArrayList(); + + private final double MIN_PROB_ERROR = 0.000001; + private final double MAX_GENOTYPE_QUALITY = 6.0; + + public void initialize() { + // setup the header fields + + final Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "r2 Value reported by Beable on each site")); + hInfo.add(new VCFHeaderLine("source", "BeagleImputation")); + + final List dataSources = this.getToolkit().getRodDataSources(); + + // Open output file specified by output VCF ROD + vcfWriter = new VCFWriter(new File(OUTPUT_FILE)); + + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY); + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY); + ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY); + + for( final ReferenceOrderedDataSource source : dataSources ) { + final RMDTrack rod = source.getReferenceOrderedData(); + if( rod.getRecordType().equals(VCFRecord.class) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) { + final VCFReader reader = new VCFReader(rod.getFile()); + final Set vcfSamples = reader.getHeader().getGenotypeSamples(); + samples.addAll(vcfSamples); + reader.close(); + } + + } + final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + + // Open Beagle files for processing + gprobsReader = new BeagleFileReader(INPUT_PREFIX.concat(".gprobs")); + likeReader = new BeagleFileReader(INPUT_PREFIX.concat(".like")); + phasedReader = new BeagleFileReader(INPUT_PREFIX.concat(".phased")); + r2Reader = new BeagleFileReader(INPUT_PREFIX.concat(".r2")); + + if (!CheckAllSamplesAreInList(likeReader.getHeaderString().split(" "), samples, 3, 3)) + throw new StingException("Inconsistent list of samples in File: " + likeReader.getFileName()); + + if (!CheckAllSamplesAreInList(gprobsReader.getHeaderString().split(" "), samples, 3, 3)) + throw new StingException("Inconsistent list of samples in File: " + gprobsReader.getFileName()); + + if (!CheckAllSamplesAreInList(phasedReader.getHeaderString().split(" "), samples, 2, 2)) + throw new StingException("Inconsistent list of samples in File: " + phasedReader.getFileName()); + + + beagleSampleRecords = new HashMap(); + + // OK, now that we have all data in string array format inside readers, process each array individually, + for(String[] gprobLine: gprobsReader.getDataSet()) + { + int j; + BeagleSampleRecord bglRecord = new BeagleSampleRecord(); + LikelihoodTrios ltrio; + + String markerKey = gprobLine[0]; + String alleleA = gprobLine[1]; + String alleleB = gprobLine[2]; + + HashMap genotypeProbabilities = new HashMap(); + + j = 3; + for (String sample : samples) { + ltrio = new LikelihoodTrios(new Double(gprobLine[j]), new Double(gprobLine[j+1]), + new Double(gprobLine[j+2])); + j = j+3; + genotypeProbabilities.put(sample,ltrio); + } + + bglRecord.GenotypeProbabilities = genotypeProbabilities; + bglRecord.AlleleA = new Allele(alleleA); + bglRecord.AlleleB = new Allele(alleleB); + + beagleSampleRecords.put(markerKey,bglRecord); + } + + // now fill in input likelihood info in the same way + for (String[] likeLine: likeReader.getDataSet()) + { + int j; + + LikelihoodTrios ltrio; + + String markerKey = likeLine[0]; + String alleleA = likeLine[1]; + String alleleB = likeLine[2]; + + HashMap genotypeLikelihoods = new HashMap(); + + j = 3; + for (String sample : samples) { + ltrio = new LikelihoodTrios(new Double(likeLine[j]), new Double(likeLine[j+1]), + new Double(likeLine[j+2])); + j = j+3; + genotypeLikelihoods.put(sample,ltrio); + } + + BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey); + bglRecord.InputLikelihoods = genotypeLikelihoods; + beagleSampleRecords.put(markerKey,bglRecord); + + } + + for (String[] phasedLine: phasedReader.getDataSet()) + { + int j; + + HaplotypePair pair; + + String markerKey = phasedLine[1]; + + HashMap haplotypePairs = new HashMap(); + + j = 2; + for (String sample : samples) { + pair = new HaplotypePair(new Allele(phasedLine[j]), new Allele(phasedLine[j+1])); + j = j+2; + haplotypePairs.put(sample,pair); + } + + BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey); + bglRecord.PhasedHaplotypes = haplotypePairs; + beagleSampleRecords.put(markerKey,bglRecord); + + } + + for (String[] r2Line: r2Reader.getDataSet()) + { + int j = 0; + + String[] vals = r2Line[0].split("\t"); + String markerKey = vals[0]; + + BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey); + bglRecord.setR2Value(Double.valueOf(vals[1])); + beagleSampleRecords.put(markerKey,bglRecord); + + } + } + + private boolean CheckAllSamplesAreInList(String[] stringArray, TreeSet samples, int valsPerSample, int initMarkers) + { + boolean allThere = true; + + if (stringArray.length != valsPerSample*samples.size()+initMarkers) { + allThere = false; + } + + for (int k=initMarkers; k < stringArray.length; k++) + if (!samples.contains(stringArray[k])) + allThere = false; + + return allThere; + + } + /* + + + private class MyFileReader { + private FileReader reader; + private String fileName; + public MyFileReader(String fileName) { + try{ + reader = new FileReader(fileName); + } catch ( FileNotFoundException e) { + throw new StingException("Could not find required input file: " + fileName); + } + + this.fileName = fileName; + + } + public String GetNextLine() { + String line; + int x = reader.read(); + line. + //reader.read() + } + } */ + + private class BeagleFileReader { + private String headerString; + private BufferedReader reader; + private String fileName; + private HashSet dataSet; + + public String getHeaderString() { + return headerString; + } + public BufferedReader getReader() { + return reader; + } + public String getFileName() { + return fileName; + } + public HashSet getDataSet() { + return dataSet; + } + + public BeagleFileReader(String fileName) { + String curLine; + + try{ + reader = new BufferedReader(new FileReader(fileName)); + } catch ( FileNotFoundException e) { + throw new StingException("Could not find required input file: " + fileName); + } + + this.fileName = fileName; + + try { + headerString = reader.readLine(); + + dataSet = new HashSet(); + while (( curLine = reader.readLine()) != null){ + // Split current line along white spaces, ignore multiple white spaces + String lineTokens[] = curLine.split(" +"); + + dataSet.add(lineTokens); + } + reader.close(); + } + catch (IOException e) { + throw new StingException("Failed while reading data from input file: " + fileName); + } + + } + + } + + private class LikelihoodTrios { + public double HomRefLikelihood; + public double HetLikelihood; + public double HomVarLikelihood; + + public LikelihoodTrios(double hr, double het, double hv) { + this.HomRefLikelihood = hr; + this.HetLikelihood = het; + this.HomVarLikelihood = hv; + } + + } + + private class HaplotypePair { + public Allele AlleleA; + public Allele AlleleB; + + public HaplotypePair(Allele a, Allele b) { + this.AlleleA = a; + this.AlleleB = b; + } + } + + private class BeagleSampleRecord { + // class containing, for each marker, all information necessary for Beagle input/output + public Allele AlleleA; + public Allele AlleleB; + private double r2Value; + public HashMap GenotypeProbabilities; + public HashMap InputLikelihoods; + public HashMap PhasedHaplotypes; + + public Double getR2Value() { + return r2Value; + } + + public void setR2Value(Double x) { + this.r2Value = x; + } + } + + + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + + if ( tracker == null ) + return 0; + + EnumSet vc = EnumSet.of(VariantContext.Type.SNP); + GenomeLoc loc = context.getLocation(); + VariantContext vc_input; + + + try { + vc_input = tracker.getVariantContext(ref,"inputvcf", vc, loc, true); + } catch (java.util.NoSuchElementException e) { + return 0; + } + + + + if (beagleSampleRecords.containsKey(loc.toString())) { + // Get record associated with this marker from output Beagle data + BeagleSampleRecord bglRecord = beagleSampleRecords.get(loc.toString()); + + // get reference base for current position + byte refByte = ref.getBase(); + + // make new Genotypes based on Beagle results + Map genotypes; + + genotypes = new HashMap(vc_input.getGenotypes().size()); + + // for each genotype, create a new object with Beagle information on it + for ( Map.Entry genotype : vc_input.getGenotypes().entrySet() ) { + + Genotype g = genotype.getValue(); + Set filters = new LinkedHashSet(g.getFilters()); + + // modify g here with Beagle info + boolean genotypeIsPhased = true; + String sample = g.getSampleName(); + + LikelihoodTrios gtprobs = bglRecord.GenotypeProbabilities.get(sample); + LikelihoodTrios inplike = bglRecord.InputLikelihoods.get(sample); + HaplotypePair hp = bglRecord.PhasedHaplotypes.get(sample); + + // We have phased genotype in hp. Need to set the isRef field in the allele. + List alleles = new ArrayList(); + + byte r[] = hp.AlleleA.getBases(); + byte rA = r[0]; +//System.out.print((char)r[0]); + + Boolean isRefA = (refByte == rA); + + Allele refAllele = new Allele(r, isRefA ); + alleles.add(refAllele); + + r = hp.AlleleB.getBases(); + byte rB = r[0]; +//System.out.println((char)r[0]); + + Boolean isRefB = (refByte == rB); + Allele altAllele = new Allele(r,isRefB); + alleles.add(altAllele); + + + + // Compute new GQ field = -10*log10Pr(Genotype call is wrong) + // Beagle gives probability that genotype is AA, AB and BB. + // Which, by definition, are prob of hom ref, het and hom var. + Double probWrongGenotype, genotypeQuality; + + if (isRefA && isRefB) // HomRef call + probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomVarLikelihood; + else if ((isRefB && !isRefA) || (isRefA && !isRefB)) + probWrongGenotype = gtprobs.HomRefLikelihood + gtprobs.HomVarLikelihood; + else // HomVar call + probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomRefLikelihood; + + + if (probWrongGenotype < MIN_PROB_ERROR) + genotypeQuality = MAX_GENOTYPE_QUALITY; + else + genotypeQuality = -log10(probWrongGenotype); + + Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased); + + genotypes.put(genotype.getKey(), imputedGenotype); + } + + + + + VariantContext filteredVC = new MutableVariantContext(vc_input.getName(), vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); + + VCFRecord vcf = VariantContextAdaptors.toVCF(filteredVC, ref.getBase(), null, false, false); + + vcf.addInfoField("R2", ((Double)bglRecord.getR2Value()).toString() ); + vcfWriter.addRecord(vcf); + } + + return 1; + + } + + public Integer reduceInit() { + return 0; // Nothing to do here + } + + /** + * Increment the number of loci processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of loci processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + out.printf("Processed %d loci.\n", result); + + vcfWriter.close(); + } + } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java similarity index 90% rename from java/src/org/broadinstitute/sting/oneoffprojects/walkers/ProduceBeagleInputWalker.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java index 6605793d4..4f13fa6c5 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.oneoffprojects.walkers; +package org.broadinstitute.sting.playground.gatk.walkers; import org.broad.tribble.vcf.VCFRecord; import org.broad.tribble.vcf.VCFGenotypeRecord; @@ -47,6 +47,7 @@ import java.util.*; /** * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF file + * @help.summary Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF file */ @Requires(value={},referenceMetaData=@RMD(name="vcf",type= VCFRecord.class)) public class ProduceBeagleInputWalker extends RodWalker { @@ -61,7 +62,7 @@ public class ProduceBeagleInputWalker extends RodWalker { final List dataSources = this.getToolkit().getRodDataSources(); for ( final ReferenceOrderedDataSource source : dataSources ) { final RMDTrack rod = source.getReferenceOrderedData(); - if ( rod.getType().equals(VCFRecord.class) ) { + if ( rod.getRecordType().equals(VCFRecord.class) ) { final VCFReader reader = new VCFReader(rod.getFile()); final Set vcfSamples = reader.getHeader().getGenotypeSamples(); samples.addAll(vcfSamples); @@ -89,6 +90,9 @@ public class ProduceBeagleInputWalker extends RodWalker { return 0; } + // Get Reference base for this site: will be output to screen, not directly used by Beagle but rather by output analysis tools + // char re = (char)ref.getBase(); + // output marker ID to Beagle input file beagleWriter.print(String.format("%s ",vc_eval.getLocation().toString())); @@ -114,12 +118,12 @@ public class ProduceBeagleInputWalker extends RodWalker { String[] glArray = genotype.getAttributeAsString(VCFGenotypeRecord.GENOTYPE_LIKELIHOODS_KEY).split(","); for (String gl : glArray) { - Double d_gl = Math.pow(10, Double.valueOf(gl)); + Double d_gl = 100*Math.pow(10, Double.valueOf(gl)); beagleWriter.print(String.format("%5.2f ",d_gl)); } } else - beagleWriter.print("0 0 0 "); // write 0 likelihoods for uncalled genotypes. + beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes. }