diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/BeagleROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/BeagleROD.java deleted file mode 100755 index e1bd47f00..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/BeagleROD.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.io.IOException; -import java.io.File; -import java.io.FileNotFoundException; - -public class BeagleROD extends BasicReferenceOrderedDatum { - GenomeLoc loc; - List sampleNames = null; - Map> sampleGenotypes = new HashMap>(); - - public BeagleROD(String name) { - super(name); - } - - public String toString() { return "BeagleRod"; } - - public String delimiterRegex() { - return " "; - } - - public GenomeLoc getLocation() { - return loc; - } - - public List getSampleNames() { - return sampleNames; - } - - public Map> getGenotypes() { - return sampleGenotypes; - } - - public Object initialize(final File source) throws FileNotFoundException { - String firstLine = new XReadLines(source).next(); - String[] parts = firstLine.split(" "); - if ( parts[0].equals("I") ) { - // I id NA12891 NA12891 NA12892 NA12892 - sampleNames = Arrays.asList(parts).subList(2, parts.length); - return sampleNames; - } else { - throw new IllegalStateException("Beagle file " + source + " doesn't have required header line I"); - } - } - - private static Pattern MARKER_PATTERN = Pattern.compile("c(.+)_p([0-9]+)"); - - public static GenomeLoc parseMarkerName(String markerName) { - Matcher m = MARKER_PATTERN.matcher(markerName); - if ( m.matches() ) { - String contig = m.group(1); - long start = Long.valueOf(m.group(2)); - return GenomeLocParser.createGenomeLoc(contig, start, start); - } else { - throw new IllegalArgumentException("Malformatted family structure string: " + markerName + " required format is mom+dad=child"); - } - } - - public boolean parseLine(final Object header, final String[] parts) throws IOException { - //System.out.printf("Parsing beagle parts=%s header=%s%n", parts, header); - List sampleNames = (List)header; - - if ( parts.length == 0 || ! parts[0].equals("M") ) - return false; - else { - loc = parseMarkerName(parts[1]); - - for ( int i = 2; i < parts.length; i++ ) { - String sampleName = sampleNames.get(i-2); - if ( ! sampleGenotypes.containsKey(sampleName) ) { - sampleGenotypes.put(sampleName, new ArrayList()); - } - - sampleGenotypes.get(sampleName).add(parts[i]); - } - - return true; - } - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java new file mode 100755 index 000000000..3183b5cde --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java @@ -0,0 +1,233 @@ +package org.broadinstitute.sting.gatk.refdata.features.beagle; + +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.util.AsciiLineReader; +import org.broad.tribble.util.LineReader; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; + +public class BeagleCodec implements FeatureCodec { + private String[] header; + public enum BeagleReaderType {PROBLIKELIHOOD, GENOTYPES, R2}; + private BeagleReaderType readerType; + private int valuesPerSample; + private int initialSampleIndex; + private int markerPosition; + private ArrayList sampleNames; + private int expectedTokensPerLine; + + private static final String delimiterRegex = "\\s+"; + + public static String[] readHeader(final File source) throws IOException { + FileInputStream is = new FileInputStream(source); + try { + return readHeader(new AsciiLineReader(is), null); + } finally { + is.close(); + } + } + + public int readHeader(LineReader reader) + { + int[] lineCounter = new int[1]; + try { + header = readHeader(reader, lineCounter); + + Boolean getSamples = true; + markerPosition = 0; //default value for all readers + + if (header[0].matches("I")) { + // Phased genotype Beagle files start with "I" + readerType = BeagleReaderType.GENOTYPES; + valuesPerSample = 2; + initialSampleIndex = 2; + markerPosition = 1; + } + else if (header[0].matches("marker")) { + readerType = BeagleReaderType.PROBLIKELIHOOD; + valuesPerSample = 3; + initialSampleIndex = 3; + } + else { + readerType = BeagleReaderType.R2; + getSamples = false; + // signal we don't have a header + lineCounter[0] = 0; + // not needed, but for consistency: + valuesPerSample = 0; + initialSampleIndex = 0; + } + + sampleNames = new ArrayList(); + + if (getSamples) { + for (int k = initialSampleIndex; k < header.length; k += valuesPerSample) + sampleNames.add(header[k]); + + expectedTokensPerLine = sampleNames.size()*valuesPerSample+initialSampleIndex; + + } else { + expectedTokensPerLine = 2; + } + + + } catch(IOException e) { + throw new IllegalArgumentException("Unable to read from file.", e); + } + return lineCounter[0]; + } + + private static String[] readHeader(final LineReader source, int[] lineCounter) throws IOException { + + String[] header = null; + int numLines = 0; + + //find the 1st line that's non-empty and not a comment + String line; + while( (line = source.readLine()) != null ) { + numLines++; + if ( line.trim().isEmpty() ) { + continue; + } + + //parse the header + header = line.split(delimiterRegex); + break; + } + + // check that we found the header + if ( header == null ) { + throw new IllegalArgumentException("No header in " + source); + } + + if(lineCounter != null) { + lineCounter[0] = numLines; + } + + return header; + } + + private static Pattern MARKER_PATTERN = Pattern.compile("(.+):([0-9]+)"); + + private static GenomeLoc parseMarkerName(String markerName) { + Matcher m = MARKER_PATTERN.matcher(markerName); + if ( m.matches() ) { + String contig = m.group(1); + long start = Long.valueOf(m.group(2)); + return GenomeLocParser.createGenomeLoc(contig, start, start); + } else { + throw new IllegalArgumentException("Malformatted marker string: " + markerName + " required format is chrN:position"); + } + } + + @Override + public Class getFeatureType() { + return BeagleFeature.class; + } + + @Override + public HeaderType getHeader(Class clazz) throws ClassCastException { + return null; // we haven't stored the header + } + + public BeagleFeature decode(String line) { + String[] tokens; + + // split the line + tokens = line.split(delimiterRegex); + if (tokens.length != expectedTokensPerLine) + throw new CodecLineParsingException("Incorrect number of fields in Beagle input on line "+line); + + + + BeagleFeature bglFeature = new BeagleFeature(); + + final GenomeLoc loc = GenomeLocParser.parseGenomeLoc(tokens[markerPosition]); //GenomeLocParser.parseGenomeInterval(values.get(0)); - TODO switch to this + + //parse the location: common to all readers + bglFeature.setChr(loc.getContig()); + bglFeature.setStart((int) loc.getStart()); + bglFeature.setEnd((int) loc.getStop()); + + // Parse R2 if needed + if (readerType == BeagleReaderType.R2) { + bglFeature.setR2value(Double.valueOf(tokens[1])); + } + else if (readerType == BeagleReaderType.GENOTYPES) { + // read phased Genotype pairs + HashMap> sampleGenotypes = new HashMap>(); + + for ( int i = 2; i < tokens.length; i+=2 ) { + String sampleName = sampleNames.get(i/2-1); + if ( ! sampleGenotypes.containsKey(sampleName) ) { + sampleGenotypes.put(sampleName, new ArrayList()); + } + + sampleGenotypes.get(sampleName).add(tokens[i]); + sampleGenotypes.get(sampleName).add(tokens[i+1]); + } + + bglFeature.setGenotypes(sampleGenotypes); + } + else { + // read probabilities/likelihood trios and alleles + bglFeature.setAlleleA(tokens[1], true); + bglFeature.setAlleleB(tokens[2], false); + HashMap> sampleProbLikelihoods = new HashMap>(); + + for ( int i = 3; i < tokens.length; i+=3 ) { + String sampleName = sampleNames.get(i/3-1); + if ( ! sampleProbLikelihoods.containsKey(sampleName) ) { + sampleProbLikelihoods.put(sampleName, new ArrayList()); + } + + sampleProbLikelihoods.get(sampleName).add(tokens[i]); + sampleProbLikelihoods.get(sampleName).add(tokens[i+1]); + sampleProbLikelihoods.get(sampleName).add(tokens[i+2]); + } + bglFeature.setProbLikelihoods(sampleProbLikelihoods); + } + + return bglFeature; + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java new file mode 100755 index 000000000..ee1dc9154 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.refdata.features.beagle; + +import org.broad.tribble.Feature; + + +import org.broad.tribble.Feature; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.utils.GenomeLoc; + +public class BeagleFeature implements Feature { + + private String chr; + private int start; + private int end; + + Map> sampleGenotypes; + private Double r2Value; + Map> probLikelihoods; + + Allele AlleleA; + Allele AlleleB; + + + public String getChr() { + return chr; + } + + public int getStart() { + return start; + } + + public int getEnd() { + return end; + } + + public Double getR2value() { + return r2Value; + } + + public Allele getAlleleA() { + return AlleleA; + } + + public Allele getAlleleB() { + return AlleleB; + } + + public Map> getProbLikelihoods() { + return probLikelihoods; + } + + public Map> getGenotypes() { + return sampleGenotypes; + } + + protected void setChr(String chr) { + this.chr = chr; + } + + protected void setStart(int start) { + this.start = start; + } + + protected void setEnd(int end) { + this.end = end; + } + + protected void setR2value(double r2) { + this.r2Value = r2; + } + + protected void setAlleleA(String a, boolean isRef) { + this.AlleleA = Allele.create(a, isRef); + } + + protected void setAlleleB(String a, boolean isRef) { + this.AlleleB = Allele.create(a, isRef); + } + + protected void setProbLikelihoods(Map> p) { + this.probLikelihoods = p; + } + + protected void setGenotypes(Map> p) { + this.sampleGenotypes = p; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java index c01fe1537..1d6ccfe7d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java @@ -31,9 +31,12 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.*; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -52,6 +55,8 @@ import static java.lang.Math.log10; * Takes files produced by Beagle imputation engine and creates a vcf with modified annotations. */ @Requires(value={},referenceMetaData=@RMD(name=BeagleOutputToVCFWalker.INPUT_ROD_NAME,type= VCFRecord.class)) +//@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="r2",type= ReferenceOrderedDatum.class), @RMD(name="beagleR2",type= BeagleR2ROD.class)}) + public class BeagleOutputToVCFWalker extends RodWalker { private VCFWriter vcfWriter; @@ -65,14 +70,16 @@ public class BeagleOutputToVCFWalker extends RodWalker { public static final String INPUT_ROD_NAME = "inputvcf"; +/* protected static BeagleFileReader gprobsReader = null; + protected static BeagleFileReader phasedReader = null; protected static BeagleFileReader likeReader = null; protected static BeagleFileReader r2Reader = null; - + */ protected static String line = null; - protected HashMap beagleSampleRecords; + // protected HashMap beagleSampleRecords; final TreeSet samples = new TreeSet(); @@ -107,336 +114,148 @@ public class BeagleOutputToVCFWalker extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); - // Open Beagle files for processing - gprobsReader = new BeagleFileReader(INPUT_PREFIX.concat(".gprobs")); - likeReader = new BeagleFileReader(INPUT_PREFIX.concat(".like")); - phasedReader = new BeagleFileReader(INPUT_PREFIX.concat(".phased")); - r2Reader = new BeagleFileReader(INPUT_PREFIX.concat(".r2")); - - if (!CheckAllSamplesAreInList(likeReader.getHeaderString().split(" "), samples, 3, 3)) - throw new StingException("Inconsistent list of samples in File: " + likeReader.getFileName()); - - if (!CheckAllSamplesAreInList(gprobsReader.getHeaderString().split(" "), samples, 3, 3)) - throw new StingException("Inconsistent list of samples in File: " + gprobsReader.getFileName()); - - if (!CheckAllSamplesAreInList(phasedReader.getHeaderString().split(" "), samples, 2, 2)) - throw new StingException("Inconsistent list of samples in File: " + phasedReader.getFileName()); - - - beagleSampleRecords = new HashMap(); - - // OK, now that we have all data in string array format inside readers, process each array individually, - for(String[] gprobLine: gprobsReader.getDataSet()) - { - int j; - BeagleSampleRecord bglRecord = new BeagleSampleRecord(); - LikelihoodTrios ltrio; - - String markerKey = gprobLine[0]; - String alleleA = gprobLine[1]; - String alleleB = gprobLine[2]; - - HashMap genotypeProbabilities = new HashMap(); - - j = 3; - for (String sample : samples) { - ltrio = new LikelihoodTrios(new Double(gprobLine[j]), new Double(gprobLine[j+1]), - new Double(gprobLine[j+2])); - j = j+3; - genotypeProbabilities.put(sample,ltrio); - } - - bglRecord.GenotypeProbabilities = genotypeProbabilities; - bglRecord.AlleleA = Allele.create(alleleA); - bglRecord.AlleleB = Allele.create(alleleB); - - beagleSampleRecords.put(markerKey,bglRecord); - } - - // now fill in input likelihood info in the same way - for (String[] likeLine: likeReader.getDataSet()) - { - int j; - - LikelihoodTrios ltrio; - - String markerKey = likeLine[0]; - - HashMap genotypeLikelihoods = new HashMap(); - - j = 3; - for (String sample : samples) { - ltrio = new LikelihoodTrios(new Double(likeLine[j]), new Double(likeLine[j+1]), - new Double(likeLine[j+2])); - j = j+3; - genotypeLikelihoods.put(sample,ltrio); - } - - BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey); - bglRecord.InputLikelihoods = genotypeLikelihoods; - beagleSampleRecords.put(markerKey,bglRecord); - - } - - for (String[] phasedLine: phasedReader.getDataSet()) - { - int j; - - HaplotypePair pair; - - String markerKey = phasedLine[1]; - - HashMap haplotypePairs = new HashMap(); - - j = 2; - for (String sample : samples) { - pair = new HaplotypePair(Allele.create(phasedLine[j]), Allele.create(phasedLine[j+1])); - j = j+2; - haplotypePairs.put(sample,pair); - } - - BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey); - bglRecord.PhasedHaplotypes = haplotypePairs; - beagleSampleRecords.put(markerKey,bglRecord); - - } - - for (String[] r2Line: r2Reader.getDataSet()) - { - - String[] vals = r2Line[0].split("\t"); - String markerKey = vals[0]; - - BeagleSampleRecord bglRecord = beagleSampleRecords.get(markerKey); - bglRecord.setR2Value(Double.valueOf(vals[1])); - beagleSampleRecords.put(markerKey,bglRecord); - - } - } - - private boolean CheckAllSamplesAreInList(String[] stringArray, TreeSet samples, int valsPerSample, int initMarkers) - { - boolean allThere = true; - - if (stringArray.length != valsPerSample*samples.size()+initMarkers) { - allThere = false; - } - - for (int k=initMarkers; k < stringArray.length; k++) - if (!samples.contains(stringArray[k])) - allThere = false; - - return allThere; - } - private class BeagleFileReader { - private String headerString; - private BufferedReader reader; - private String fileName; - private HashSet dataSet; - - public String getHeaderString() { - return headerString; - } - public BufferedReader getReader() { - return reader; - } - public String getFileName() { - return fileName; - } - public HashSet getDataSet() { - return dataSet; - } - - public BeagleFileReader(String fileName) { - String curLine; - - try{ - reader = new BufferedReader(new FileReader(fileName)); - } catch ( FileNotFoundException e) { - throw new StingException("Could not find required input file: " + fileName); - } - - this.fileName = fileName; - - try { - headerString = reader.readLine(); - - dataSet = new HashSet(); - while (( curLine = reader.readLine()) != null){ - // Split current line along white spaces, ignore multiple white spaces - String lineTokens[] = curLine.split(" +"); - - dataSet.add(lineTokens); - } - reader.close(); - } - catch (IOException e) { - throw new StingException("Failed while reading data from input file: " + fileName); - } - - } - - } - - private class LikelihoodTrios { - public double HomRefLikelihood; - public double HetLikelihood; - public double HomVarLikelihood; - - public LikelihoodTrios(double hr, double het, double hv) { - this.HomRefLikelihood = hr; - this.HetLikelihood = het; - this.HomVarLikelihood = hv; - } - - } - - private class HaplotypePair { - public Allele AlleleA; - public Allele AlleleB; - - public HaplotypePair(Allele a, Allele b) { - this.AlleleA = a; - this.AlleleB = b; - } - } - - private class BeagleSampleRecord { - // class containing, for each marker, all information necessary for Beagle input/output - public Allele AlleleA; - public Allele AlleleB; - private double r2Value; - public HashMap GenotypeProbabilities; - public HashMap InputLikelihoods; - public HashMap PhasedHaplotypes; - - public Double getR2Value() { - return r2Value; - } - - public void setR2Value(Double x) { - this.r2Value = x; - } - } - - - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { if ( tracker == null ) return 0; - EnumSet vc = EnumSet.of(VariantContext.Type.SNP); GenomeLoc loc = context.getLocation(); - VariantContext vc_input; - - - try { - vc_input = tracker.getVariantContext(ref,"inputvcf", vc, loc, true); - } catch (java.util.NoSuchElementException e) { + VariantContext vc_input = tracker.getVariantContext(ref,"inputvcf", null, loc, false); + if ( vc_input == null ) return 0; + + + List likerods = tracker.getReferenceMetaData("beagleLike"); + // ignore places where we don't have a variant + if ( likerods.size() == 0 ) + return 0; + + BeagleFeature beagleLikeFeature = (BeagleFeature)likerods.get(0); + + List r2rods = tracker.getReferenceMetaData("beagleR2"); + + // ignore places where we don't have a variant + if ( r2rods.size() == 0 ) + return 0; + + BeagleFeature beagleR2Feature = (BeagleFeature)r2rods.get(0); + + List gProbsrods = tracker.getReferenceMetaData("beagleProbs"); + + // ignore places where we don't have a variant + if ( gProbsrods.size() == 0 ) + return 0; + + BeagleFeature beagleProbsFeature = (BeagleFeature)gProbsrods.get(0); + + List gPhasedrods = tracker.getReferenceMetaData("beaglePhased"); + + // ignore places where we don't have a variant + if ( gPhasedrods.size() == 0 ) + return 0; + + BeagleFeature beaglePhasedFeature = (BeagleFeature)gPhasedrods.get(0); + + // get reference base for current position + byte refByte = ref.getBase(); + + // make new Genotypes based on Beagle results + Map genotypes = new HashMap(vc_input.getGenotypes().size()); + + + // for each genotype, create a new object with Beagle information on it + for ( Map.Entry genotype : vc_input.getGenotypes().entrySet() ) { + + + Genotype g = genotype.getValue(); + Set filters = new LinkedHashSet(g.getFilters()); + + boolean genotypeIsPhased = true; + String sample = g.getSampleName(); + + ArrayList beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample); + ArrayList beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample); + + + // We have phased genotype in hp. Need to set the isRef field in the allele. + List alleles = new ArrayList(); + + String alleleA = beagleGenotypePairs.get(0); + String alleleB = beagleGenotypePairs.get(1); + + byte[] r = alleleA.getBytes(); + byte rA = r[0]; + + Boolean isRefA = (refByte == rA); + + Allele refAllele = Allele.create(r, isRefA ); + alleles.add(refAllele); + + r = alleleB.getBytes(); + byte rB = r[0]; + + Boolean isRefB = (refByte == rB); + Allele altAllele = Allele.create(r,isRefB); + alleles.add(altAllele); + + // Compute new GQ field = -10*log10Pr(Genotype call is wrong) + // Beagle gives probability that genotype is AA, AB and BB. + // Which, by definition, are prob of hom ref, het and hom var. + Double probWrongGenotype, genotypeQuality; + Double homRefProbability = Double.valueOf(beagleProbabilities.get(0)); + Double hetProbability = Double.valueOf(beagleProbabilities.get(1)); + Double homVarProbability = Double.valueOf(beagleProbabilities.get(2)); + + if (isRefA && isRefB) // HomRef call + probWrongGenotype = hetProbability + homVarProbability; + else if ((isRefB && !isRefA) || (isRefA && !isRefB)) + probWrongGenotype = homRefProbability + homVarProbability; + else // HomVar call + probWrongGenotype = hetProbability + homRefProbability; + + + if (probWrongGenotype < MIN_PROB_ERROR) + genotypeQuality = MAX_GENOTYPE_QUALITY; + else + genotypeQuality = -log10(probWrongGenotype); + + Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased); + + + genotypes.put(genotype.getKey(), imputedGenotype); + } - - if (beagleSampleRecords.containsKey(loc.toString())) { - // Get record associated with this marker from output Beagle data - BeagleSampleRecord bglRecord = beagleSampleRecords.get(loc.toString()); - - // get reference base for current position - byte refByte = ref.getBase(); - - // make new Genotypes based on Beagle results - Map genotypes; - - genotypes = new HashMap(vc_input.getGenotypes().size()); - - // for each genotype, create a new object with Beagle information on it - for ( Map.Entry genotype : vc_input.getGenotypes().entrySet() ) { - - Genotype g = genotype.getValue(); - Set filters = new LinkedHashSet(g.getFilters()); - - // modify g here with Beagle info - boolean genotypeIsPhased = true; - String sample = g.getSampleName(); - - LikelihoodTrios gtprobs = bglRecord.GenotypeProbabilities.get(sample); -// LikelihoodTrios inplike = bglRecord.InputLikelihoods.get(sample); - HaplotypePair hp = bglRecord.PhasedHaplotypes.get(sample); - - // We have phased genotype in hp. Need to set the isRef field in the allele. - List alleles = new ArrayList(); - - byte r[] = hp.AlleleA.getBases(); - byte rA = r[0]; - - Boolean isRefA = (refByte == rA); - - Allele refAllele = Allele.create(r, isRefA ); - alleles.add(refAllele); - - r = hp.AlleleB.getBases(); - byte rB = r[0]; - - Boolean isRefB = (refByte == rB); - Allele altAllele = Allele.create(r,isRefB); - alleles.add(altAllele); + VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); - - // Compute new GQ field = -10*log10Pr(Genotype call is wrong) - // Beagle gives probability that genotype is AA, AB and BB. - // Which, by definition, are prob of hom ref, het and hom var. - Double probWrongGenotype, genotypeQuality; - - if (isRefA && isRefB) // HomRef call - probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomVarLikelihood; - else if ((isRefB && !isRefA) || (isRefA && !isRefB)) - probWrongGenotype = gtprobs.HomRefLikelihood + gtprobs.HomVarLikelihood; - else // HomVar call - probWrongGenotype = gtprobs.HetLikelihood + gtprobs.HomRefLikelihood; - - - if (probWrongGenotype < MIN_PROB_ERROR) - genotypeQuality = MAX_GENOTYPE_QUALITY; - else - genotypeQuality = -log10(probWrongGenotype); - - Genotype imputedGenotype = new Genotype(genotype.getKey(), alleles, genotypeQuality, filters, g.getAttributes(), genotypeIsPhased); - - genotypes.put(genotype.getKey(), imputedGenotype); - } - - - VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); - - - Set altAlleles = filteredVC.getAlternateAlleles(); - StringBuffer altAlleleCountString = new StringBuffer(); - for ( Allele allele : altAlleles ) { - if ( altAlleleCountString.length() > 0 ) - altAlleleCountString.append(","); - altAlleleCountString.append(filteredVC.getChromosomeCount(allele)); - } - - VCFRecord vcf = VariantContextAdaptors.toVCF(filteredVC, ref.getBase()); - - if ( filteredVC.getChromosomeCount() > 0 ) { - vcf.addInfoField(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount())); - if ( altAlleleCountString.length() > 0 ) { - vcf.addInfoField(VCFRecord.ALLELE_COUNT_KEY, altAlleleCountString.toString()); - vcf.addInfoField(VCFRecord.ALLELE_FREQUENCY_KEY, String.format("%4.2f", - Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount()))); - } - } - - vcf.addInfoField("R2", (bglRecord.getR2Value()).toString() ); - vcfWriter.addRecord(vcf); + Set altAlleles = filteredVC.getAlternateAlleles(); + StringBuffer altAlleleCountString = new StringBuffer(); + for ( Allele allele : altAlleles ) { + if ( altAlleleCountString.length() > 0 ) + altAlleleCountString.append(","); + altAlleleCountString.append(filteredVC.getChromosomeCount(allele)); } + VCFRecord vcf = VariantContextAdaptors.toVCF(filteredVC, ref.getBase()); + + if ( filteredVC.getChromosomeCount() > 0 ) { + vcf.addInfoField(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount())); + if ( altAlleleCountString.length() > 0 ) { + vcf.addInfoField(VCFRecord.ALLELE_COUNT_KEY, altAlleleCountString.toString()); + vcf.addInfoField(VCFRecord.ALLELE_FREQUENCY_KEY, String.format("%4.2f", + Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount()))); + } + } + + vcf.addInfoField("R2", beagleR2Feature.getR2value().toString() ); + vcfWriter.addRecord(vcf); + + return 1; } @@ -448,22 +267,22 @@ public class BeagleOutputToVCFWalker extends RodWalker { /** * Increment the number of loci processed. * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of loci processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } - /** - * Tell the user the number of loci processed and close out the new variants file. - * - * @param result the number of loci seen. - */ - public void onTraversalDone(Integer result) { - out.printf("Processed %d loci.\n", result); + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + out.printf("Processed %d loci.\n", result); - vcfWriter.close(); - } - } + vcfWriter.close(); + } +}