diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index df710146a..3f5dd1396 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -68,6 +68,9 @@ public class BeagleOutputToVCFWalker extends RodWalker { @Deprecated protected String oldOutputArg; + @Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false) + public boolean DONT_FILTER_MONOMORPHIC_SITES = false; + @Argument(fullName="no" + "call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false) private double noCallThreshold = 0.0; @@ -304,7 +307,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { } VariantContext filteredVC; - if ( beagleVarCounts > 0) + if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes()); else { Set removedFilters = vc_input.filtersWereApplied() ? new HashSet(vc_input.getFilters()) : new HashSet(1); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 8edd31eb5..916bfa9e3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -45,6 +45,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.io.File; @@ -64,6 +65,10 @@ public class ProduceBeagleInputWalker extends RodWalker { @Output(doc="File to which BEAGLE input should be written",required=true) protected PrintStream beagleWriter = null; + @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) + protected PrintStream markers = null; + int markerCounter = 1; + @Hidden @Input(doc="VQSqual calibration file", shortName = "cc", required=false) protected File VQSRCalibrationFile = null; @@ -73,10 +78,6 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(doc="VQSqual key", shortName = "vqskey", required=false) protected String VQSLOD_KEY = "VQSqual"; -// @Hidden -// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false) -// protected boolean includeFilteredRecords = false; - @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) @@ -92,12 +93,13 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false) public double variantPrior = 0.96; - private Set samples = null; private Set BOOTSTRAP_FILTER = new HashSet( Arrays.asList("bootstrap") ); private Random generator; private int bootstrapSetSize = 0; private int testSetSize = 0; + private CachingFormatter formatter = new CachingFormatter("%5.4f ", 100000); + private int certainFPs = 0; public void initialize() { generator = new Random(); @@ -122,17 +124,15 @@ public class ProduceBeagleInputWalker extends RodWalker { } public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - if( tracker != null ) { GenomeLoc loc = context.getLocation(); - VariantContext variant_eval; - VariantContext validation_eval; + VariantContext variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true); + VariantContext validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true); - variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true); - validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true); if ( goodSite(variant_eval,validation_eval) ) { - if ( useValidation(variant_eval,validation_eval, ref) ) { - writeBeagleOutput(validation_eval,variant_eval,true,validationPrior); + if ( useValidation(validation_eval, ref) ) { + writeBeagleOutput(validation_eval, variant_eval, true, validationPrior); + return 1; } else { if ( goodSite(variant_eval) ) { writeBeagleOutput(variant_eval,validation_eval,false,variantPrior); @@ -141,7 +141,6 @@ public class ProduceBeagleInputWalker extends RodWalker { return 0; } } - return 1; } else { return 0; } @@ -155,11 +154,23 @@ public class ProduceBeagleInputWalker extends RodWalker { } public boolean goodSite(VariantContext v) { - return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); - //return v != null && (includeFilteredRecords || ! v.isFiltered()) && v.isBiallelic() && v.hasGenotypes(); + if ( canBeOutputToBeagle(v) ) { + if ( VQSRCalibrator != null && VQSRCalibrator.certainFalsePositive(VQSLOD_KEY, v) ) { + certainFPs++; + return false; + } else { + return true; + } + } else { + return false; + } } - public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) { + public static boolean canBeOutputToBeagle(VariantContext v) { + return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); + } + + public boolean useValidation(VariantContext validation, ReferenceContext ref) { if( goodSite(validation) ) { // if using record keeps us below expected proportion, use it logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); @@ -167,7 +178,7 @@ public class ProduceBeagleInputWalker extends RodWalker { if ( bootstrapVCFOutput != null ) { bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() ); } - bootstrapSetSize ++; + bootstrapSetSize++; return true; } else { if ( bootstrapVCFOutput != null ) { @@ -191,7 +202,9 @@ public class ProduceBeagleInputWalker extends RodWalker { GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC); StringBuffer beagleOut = new StringBuffer(); - beagleOut.append(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart())); + String marker = String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart()); + beagleOut.append(marker); + if ( markers != null ) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t"); for ( Allele allele : preferredVC.getAlleles() ) { String bglPrintString; if (allele.isNoCall() || allele.isNull()) @@ -200,7 +213,9 @@ public class ProduceBeagleInputWalker extends RodWalker { bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele beagleOut.append(String.format("%s ", bglPrintString)); + if ( markers != null ) markers.append(bglPrintString).append("\t"); } + if ( markers != null ) markers.append("\n"); Map preferredGenotypes = preferredVC.getGenotypes(); Map otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; @@ -265,13 +280,15 @@ public class ProduceBeagleInputWalker extends RodWalker { } private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) { - if ( VQSRCalibrator != null ) + if ( VQSRCalibrator != null ) { log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods); + } double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); // see if we need to randomly mask out genotype in this position. for (double likeVal: normalizedLikelihoods) { - out.append(String.format("%5.4f ",likeVal)); + out.append(formatter.format(likeVal)); +// out.append(String.format("%5.4f ",likeVal)); } } @@ -281,11 +298,13 @@ public class ProduceBeagleInputWalker extends RodWalker { } public Integer reduce( Integer value, Integer sum ) { - return 0; // Nothing to do here + return value + sum; // count up the sites } - public void onTraversalDone( Integer sum ) { - + public void onTraversalDone( Integer includedSites ) { + logger.info("Sites included in beagle likelihoods file : " + includedSites); + logger.info(String.format("Certain false positive found from recalibration curve : %d (%.2f%%)", + certainFPs, (100.0 * certainFPs) / (Math.max(certainFPs + includedSites, 1)))); } private void initializeVcfWriter() { @@ -301,4 +320,114 @@ public class ProduceBeagleInputWalker extends RodWalker { bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); } + public static class CachingFormatter { + private int maxCacheSize = 0; + private String format; + private LRUCache cache; + + public String getFormat() { + return format; + } + + public String format(double value) { + String f = cache.get(value); + if ( f == null ) { + f = String.format(format, value); + cache.put(value, f); +// if ( cache.usedEntries() < maxCacheSize ) { +// System.out.printf("CACHE size %d%n", cache.usedEntries()); +// } else { +// System.out.printf("CACHE is full %f%n", value); +// } +// } +// } else { +// System.out.printf("CACHE hit %f%n", value); +// } + } + + return f; + } + + public CachingFormatter(String format, int maxCacheSize) { + this.maxCacheSize = maxCacheSize; + this.format = format; + this.cache = new LRUCache(maxCacheSize); + } + } + + /** + * An LRU cache, based on LinkedHashMap. + * + *

+ * This cache has a fixed maximum number of elements (cacheSize). + * If the cache is full and another entry is added, the LRU (least recently used) entry is dropped. + * + *

+ * This class is thread-safe. All methods of this class are synchronized. + * + *

+ * Author: Christian d'Heureuse, Inventec Informatik AG, Zurich, Switzerland
+ * Multi-licensed: EPL / LGPL / GPL / AL / BSD. + */ + public static class LRUCache { + + private static final float hashTableLoadFactor = 0.75f; + + private LinkedHashMap map; + private int cacheSize; + + /** + * Creates a new LRU cache. + * @param cacheSize the maximum number of entries that will be kept in this cache. + */ + public LRUCache (int cacheSize) { + this.cacheSize = cacheSize; + int hashTableCapacity = (int)Math.ceil(cacheSize / hashTableLoadFactor) + 1; + map = new LinkedHashMap(hashTableCapacity, hashTableLoadFactor, true) { + // (an anonymous inner class) + private static final long serialVersionUID = 1; + @Override protected boolean removeEldestEntry (Map.Entry eldest) { + return size() > LRUCache.this.cacheSize; }}; } + + /** + * Retrieves an entry from the cache.
+ * The retrieved entry becomes the MRU (most recently used) entry. + * @param key the key whose associated value is to be returned. + * @return the value associated to this key, or null if no value with this key exists in the cache. + */ + public synchronized V get (K key) { + return map.get(key); } + + /** + * Adds an entry to this cache. + * The new entry becomes the MRU (most recently used) entry. + * If an entry with the specified key already exists in the cache, it is replaced by the new entry. + * If the cache is full, the LRU (least recently used) entry is removed from the cache. + * @param key the key with which the specified value is to be associated. + * @param value a value to be associated with the specified key. + */ + public synchronized void put (K key, V value) { + map.put (key, value); } + + /** + * Clears the cache. + */ + public synchronized void clear() { + map.clear(); } + + /** + * Returns the number of used entries in the cache. + * @return the number of entries currently in the cache. + */ + public synchronized int usedEntries() { + return map.size(); } + + /** + * Returns a Collection that contains a copy of all cache entries. + * @return a Collection with a copy of the cache content. + */ + public synchronized Collection> getAll() { + return new ArrayList>(map.entrySet()); } + + } // end class LRUCache } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java new file mode 100755 index 000000000..9a9aff457 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.beagle; + +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFWriter; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import java.io.PrintStream; +import java.util.*; + +/** + * Produces an input file to Beagle imputation engine, listing unphased, hard-called genotypes for a single sample + * in input variant file. Will additional hold back a fraction of the sites for evaluation, marking the + * genotypes at that sites as missing, and writing the truth of these sites to a second VCF file + */ +@Requires(value={},referenceMetaData=@RMD(name= VariantsToBeagleUnphasedWalker.ROD_NAME, type=VariantContext.class)) +public class VariantsToBeagleUnphasedWalker extends RodWalker { + public static final String ROD_NAME = "variant"; + + @Output(doc="File to which BEAGLE unphased genotypes should be written",required=true) + protected PrintStream beagleWriter = null; + + @Argument(fullName = "bootstrap_fraction", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) + public double bootstrap = 0.0; + + @Argument(fullName = "bootstrap_vcf",shortName = "bsvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) + VCFWriter bootstrapVCFOutput = null; + + @Argument(fullName = "missing", shortName = "missing", doc = "String to identify missing data in beagle output", required = false) + public String MISSING = "?"; + + private Set samples = null; + private int bootstrapSetSize = 0; + private int testSetSize = 0; + + public void initialize() { + samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME)); + + beagleWriter.print("I marker alleleA alleleB"); + for ( String sample : samples ) + beagleWriter.print(String.format(" %s %s", sample, sample)); + + beagleWriter.println(); + + if ( bootstrap < 0.0 | bootstrap > 1.0 ) + throw new UserException.BadArgumentValue("bootstrap", "Bootstrap value must be fraction between 0 and 1"); + + if ( bootstrapVCFOutput != null ) { + Set hInfo = VCFUtils.getHeaderFields(getToolkit()); + bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit()))); + } + } + + /** + * Iterate over each site, emitting the BEAGLE unphased genotypes file format + * @param tracker + * @param ref + * @param context + * @return + */ + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + if( tracker != null ) { + GenomeLoc loc = context.getLocation(); + VariantContext vc = tracker.getVariantContext(ref, ROD_NAME, null, loc, true); + + if ( ProduceBeagleInputWalker.canBeOutputToBeagle(vc) ) { + // do we want to hold back this site? + boolean makeMissing = dropSite(vc); + + // if we are holding it back and we are writing a bootstrap VCF, write it out + if ( makeMissing && bootstrapVCFOutput != null ) { + bootstrapVCFOutput.add(vc, ref.getBase()); + } + + // regardless, all sites are written to the unphased genotypes file, marked as missing if appropriate + writeUnphasedBeagleOutput(vc, makeMissing); + } + } + + return 0; + } + + /** + * Do we want to hold back this site for bootstrap? Considers the bootstrap fraction member variable + * + * @param vc + * @return + */ + public boolean dropSite(VariantContext vc) { + if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) { + bootstrapSetSize++; + return true; + } else { + testSetSize++; + return false; + } + } + + public void writeUnphasedBeagleOutput(VariantContext vc, boolean makeMissing) { + GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc); + StringBuffer beagleOut = new StringBuffer(); + + String marker = String.format("%s:%d ",currentLoc.getContig(), currentLoc.getStart()); + beagleOut.append("M ").append(marker); + + // write out the alleles at this site + for ( Allele allele : vc.getAlleles() ) { + beagleOut.append(allele.isNoCall() || allele.isNull() ? "-" : allele.getBaseString()).append(" "); + } + + // write out sample level genotypes + for ( String sample : samples ) { + Genotype genotype = vc.getGenotype(sample); + if ( ! makeMissing && genotype.isCalled() ) { + addAlleles(beagleOut, genotype); + } else { + addAlleles(beagleOut, MISSING, MISSING); + } + } + + beagleWriter.println(beagleOut.toString()); + } + + private void addAlleles(StringBuffer buf, Genotype g) { + addAlleles(buf, g.getAllele(0).getBaseString(), g.getAllele(1).getBaseString()); + + } + + private void addAlleles(StringBuffer buf, String a, String b) { + buf.append(a).append(" ").append(b); + } + + public Integer reduceInit() { return 0; } + public Integer reduce( Integer value, Integer sum ) { return value + sum; } + + public void onTraversalDone( Integer includedSites ) { + logger.info("Sites included in beagle genotypes file : " + includedSites); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java index f67b51970..b1f6596cb 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java @@ -22,6 +22,7 @@ import java.util.List; public class VQSRCalibrationCurve { private final static boolean DEBUG = false; List points; + public static final double CERTAIN_FALSE_POSITIVE = -1; private static class VQSRRange { double start, stop, truePositiveRate; @@ -52,7 +53,9 @@ public class VQSRCalibrationCurve { for ( String line : new XReadLines(source).readLines() ) { if ( ! line.trim().isEmpty() ) { String[] parts = line.split("\\s+"); - points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), 1 - Double.parseDouble(parts[2]))); + double fpRate = Double.parseDouble(parts[2]); + double tpRate = fpRate >= 1.0 ? CERTAIN_FALSE_POSITIVE : 1.0 - fpRate; + points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), tpRate)); } } } catch ( FileNotFoundException e ) { @@ -70,6 +73,11 @@ public class VQSRCalibrationCurve { this.points = points; } + public boolean certainFalsePositive(String VQSRQualKey, VariantContext vc) { + return probTrueVariant(VQSRQualKey, vc) == CERTAIN_FALSE_POSITIVE; + } + + public double probTrueVariant(double VQSRqual) { for ( VQSRRange r : points ) { if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() ) @@ -90,21 +98,33 @@ public class VQSRCalibrationCurve { } } + /** + * Returns a likelihoods vector adjusted by the probability that the site is an error. Returns a + * null vector if the probability of the site being real is 0.0 + * @param VQSRQualKey + * @param vc + * @param log10Likelihoods + * @return + */ public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) { double[] updated = new double[log10Likelihoods.length]; double alpha = probTrueVariant(VQSRQualKey, vc); - double qual = vc.getAttributeAsDouble(VQSRQualKey); // todo -- remove me - double noInfoPr = 1.0 / 3; - if ( DEBUG ) System.out.printf("------------------------------%n"); - for ( int i = 0; i < log10Likelihoods.length; i++) { - double p = Math.pow(10, log10Likelihoods[i]); - double q = alpha * p + (1-alpha) * noInfoPr; - if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", qual, p, alpha, q); - updated[i] = Math.log10(q); - } - return updated; + if ( alpha == CERTAIN_FALSE_POSITIVE ) + return null; + else { + double noInfoPr = 1.0 / 3; + if ( DEBUG ) System.out.printf("------------------------------%n"); + for ( int i = 0; i < log10Likelihoods.length; i++) { + double p = Math.pow(10, log10Likelihoods[i]); + double q = alpha * p + (1-alpha) * noInfoPr; + if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q); + updated[i] = Math.log10(q); + } + + return updated; + } } diff --git a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q index 402d7845d..3aceb0fa3 100755 --- a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q +++ b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q @@ -20,9 +20,18 @@ class RefineGenotypesWithBeagle extends QScript { @Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve") @Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false + @Argument(doc="Tmpdir",required=false,shortName="tmpdir") var TMPDIR: File = new File("./") + // assessing imputation performance + @Argument(doc="BAM File for genotyping",required=false, shortName="bam") var bam: File = null + @Argument(doc="Percent of sites that should be left out of BAM VCF to assess imputation", required=false, shortName="flo") + var fractionsLeftOut: List[Double] = List(0.1, 0.2, 0.5, 0.9) + // todo -- this might be best to think about in a different unit -- marker density per bp + + val MISSING_KEY = "?" val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf") val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf") + val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf" trait GATKArgs extends CommandLineGATK { this.reference_sequence = qscript.reference @@ -30,94 +39,194 @@ class RefineGenotypesWithBeagle extends QScript { this.memoryLimit = Some(2) } - class GunzipFile(in: File, out:File ) extends CommandLineFunction { - @Input(doc="file to gunzip") var inp = in - @Output(doc="file to gunzip to") var outp = out + // -------------------------------------------------------------------------------- + // + // GENOTYPING SPECIFIC SITES IN A BAM FILE + // + // -------------------------------------------------------------------------------- - def commandLine = "gunzip -c %s > %s".format(inp.getAbsolutePath, outp.getAbsolutePath) - } + class GenotypeBAMAtSites(@Input bam: File, @Input sitesVCF: File, @Output genotypesVCF: File) extends UnifiedGenotyper with GATKArgs { + this.input_file = List(bam) + this.o = genotypesVCF + this.stand_call_conf = Some(0.0) + this.out_mode = Some(org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES) + this.gt_mode = Some(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) + this.rodBind :+= new RodBind("alleles","VCF",sitesVCF) - class BeagleRefinement(moreBeagleArgs: String = "") extends CommandLineFunction { - @Input(doc="The beagle input file") var beagleInput: File = _ - var beagleOutputBase: String = _ - var beagleMemoryGigs: Int = BEAGLE_MEM_IN_GB + // we only want chromosome counts annotations + this.BTI = "alleles" + this.G = List("none") + this.A :+= "ChromosomeCounts" + this.nsl = true - /** - * Note: These get set - */ - @Output(doc="The beagle phased file") var beaglePhasedFile: File = _ - @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _ - @Output(doc="The beagle r2 file") var beagleRSquared: File = _ - var beagleOutputDir: String = _ - - def freezeOutputs = { - if ( beagleInput.getParent == null ) { - beagleOutputDir = "" - } else { - beagleOutputDir = beagleInput.getParent - } - beaglePhasedFile = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".phased.gz") - beagleLikelihoods = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".gprobs.gz") - beagleRSquared = new File(beagleOutputDir + beagleOutputBase +"."+beagleInput.getName+".r2") + // make sure we have the right intervals + if ( interval != null ) { + this.intervalsString = List(interval) + this.BTIMR = Some(org.broadinstitute.sting.utils.interval.IntervalSetRule.INTERSECTION) } - - def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s %s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,moreBeagleArgs,beagleOutputBase) } - def RefineGenotypes(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") = { - var beagleInput = new ProduceBeagleInput with GATKArgs + // -------------------------------------------------------------------------------- + // + // BEAGLE COMMANDS + // + // -------------------------------------------------------------------------------- + + class BeagleCommand(outputBase: String) extends CommandLineFunction { + this.memoryLimit = Some(BEAGLE_MEM_IN_GB) + + // Note: These get set + @Output val beaglePhasedFile: File = new File(outputBase +".phased.gz") + @Output val beagleLikelihoods: File = new File(outputBase +".gprobs.gz") + @Output val beagleRSquared: File = new File(outputBase +".r2") + + def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s out=ignore.me omitprefix=true".format(TMPDIR, BEAGLE_MEM_IN_GB, beagleJar) + } + + class RefineGenotypesWithBeagle(@Input beagleInput: File, moreBeagleArgs: String = "") + extends BeagleCommand(beagleInput.getName) { + def myArgs = " like=%s %s".format(beagleInput.getAbsolutePath, moreBeagleArgs) + override def commandLine = super.commandLine + myArgs + } + + class ImputeMissingGenotypesWithReferencePanel(@Input evalBeagle: File, + @Input phasedBeagleFile: File, + @Input markers: File, + moreBeagleArgs: String = "") + extends BeagleCommand(evalBeagle.getName) { + def myArgs = " unphased=%s phased=%s markers=%s %s".format(evalBeagle.getAbsolutePath, + phasedBeagleFile.getAbsolutePath, markers.getAbsolutePath, moreBeagleArgs) + override def commandLine = super.commandLine + myArgs + } + + class GunzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { + def commandLine = "gunzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath) + } + + // -------------------------------------------------------------------------------- + // + // CREATING AND EVALUATING REFERENCE PANELS + // + // -------------------------------------------------------------------------------- + + class ReferencePanelBuilder(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") { + val beagleInput = new ProduceBeagleInput with GATKArgs if ( interval != null ) beagleInput.intervalsString = List(interval) beagleInput.variantVCF = inputVCF beagleInput.out = swapExt(outputVCF,".vcf",".beagle") if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE + beagleInput.markers = swapExt(outputVCF, ".vcf", ".markers.txt") + beagleInput.isIntermediate = true - var refine = new BeagleRefinement(moreBeagleArgs) - refine.beagleInput = beagleInput.out - refine.beagleOutputBase = outputVCF.getName + ".bout" - refine.beagleMemoryGigs = BEAGLE_MEM_IN_GB - refine.memoryLimit = Some(BEAGLE_MEM_IN_GB) - refine.freezeOutputs + val refine = new RefineGenotypesWithBeagle(beagleInput.out, moreBeagleArgs) - var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) - var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) -// unzipPhased.isIntermediate = true -// unzipProbs.isIntermediate = true + val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) + val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) + //unzipPhased.isIntermediate = true + //unzipProbs.isIntermediate = true - var vcfConvert = new BeagleOutputToVCF with GATKArgs + val vcfConvert = new BeagleOutputToVCF with GATKArgs vcfConvert.variantVCF = inputVCF vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared) - vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp) - vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp) + vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out) + vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out) vcfConvert.out = outputVCF - for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) ) - add(cmd) + def getMarkers = beagleInput.markers + def getPanelPhasedHaplotypes = refine.beaglePhasedFile + def getPanelVCF = vcfConvert.out + + def enqueueCommands() = { + for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) ) + add(cmd) + } } - class MyEval(@Input(doc="Evaluation file") eval: File) extends VariantEval with GATKArgs { + class EvaluateReferencePanel(@Input evalVCF: File, + @Output outputVCF: File, + panel: ReferencePanelBuilder, + percentLeftOut: Double, + moreBeagleArgs: String = "") { + val evalBeagle = new VariantsToBeagleUnphased with GATKArgs + if ( interval != null ) evalBeagle.intervalsString = List(interval) + evalBeagle.variantVCF = evalVCF + evalBeagle.out = swapExt(outputVCF,".vcf",".unphased.beagle") + evalBeagle.bs = Some(percentLeftOut) + evalBeagle.bsvcf = swapExt(outputVCF,".vcf",".missing.vcf") + evalBeagle.missing = MISSING_KEY + //evalBeagle.isIntermediate = true + + val refine = new ImputeMissingGenotypesWithReferencePanel(evalBeagle.out, panel.getPanelPhasedHaplotypes, panel.getMarkers, moreBeagleArgs) + + val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) + val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) + //unzipPhased.isIntermediate = true + //unzipProbs.isIntermediate = true + + val vcfConvert = new BeagleOutputToVCF with GATKArgs + vcfConvert.variantVCF = evalVCF + vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared) + vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out) + vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out) + vcfConvert.out = outputVCF + vcfConvert.keep_monomorphic = true + + def getBootstrap: File = evalBeagle.bsvcf + + def enqueueCommands() = { + for ( cmd: CommandLineFunction <- List(evalBeagle, refine, unzipPhased, unzipProbs, vcfConvert) ) + add(cmd) + } + } + + class EvalPanelAtChipSites(@Input eval: File) extends VariantEval with GATKArgs { this.noST = true this.evalModule :+= "GenotypeConcordance" this.o = swapExt(eval, ".vcf", ".vcf.eval") this.rodBind :+= RodBind("eval", "VCF", eval) this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF) this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF) - if ( EvalInterval != null ) { - Console.printf("EvalInterval " + EvalInterval) - this.intervalsString = List(EvalInterval) - } + if ( EvalInterval != null ) this.intervalsString = List(EvalInterval) + } + + class EvalPanelAtBAMCalledSites(@Input imputedVCF: File, @Input bamGenotypes: File, @Input bootstrap: File) extends VariantEval with GATKArgs { + this.evalModule :+= "GenotypeConcordance" + this.o = swapExt(imputedVCF, ".vcf", ".vcf.eval") + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP_b37) + this.rodBind :+= RodBind("eval", "VCF", imputedVCF) + this.rodBind :+= RodBind("comp_bam_genotypes", "VCF", bamGenotypes) + this.rodBind :+= RodBind("comp_bootstrap", "VCF", bootstrap) + if ( EvalInterval != null ) this.intervalsString = List(EvalInterval) } def script = { for ( vcf <- vcfsToBeagle ) { - if ( ! TEST ) add(new MyEval(vcf)) -// for ( niter <- List(10) ) { + var bamGenotypes: File = null + if ( bam != null ) { + bamGenotypes = new File(swapExt(bam, ".bam", "_genotyped_at." + vcf.getName).getName) + add(new GenotypeBAMAtSites(bam, vcf, bamGenotypes)) + } + + if ( ! TEST ) add(new EvalPanelAtChipSites(vcf)) for ( useCalibrationCurve <- List(true, false) ) { -// for ( includeFilteredRecords <- List(true, false) ) { - for ( niter <- List(10, 25, 50) ) { + for ( niter <- List(10, 20, 50) ) { if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) { val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve)) - RefineGenotypes(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter)) - add(new MyEval(refine_out)) + val refPanel = new ReferencePanelBuilder(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter)) + refPanel.enqueueCommands() + + // start up VE + add(new EvalPanelAtChipSites(refine_out)) + + if ( bamGenotypes != null ) { + for ( fractionLeftOut <- fractionsLeftOut ) { + val bamGenotypesImputed = swapExt(bamGenotypes, ".vcf", "_flo_%.2f.imputed.vcf".format(fractionLeftOut)) + val args = "niterations=%d missing=\"%s\"".format(niter, MISSING_KEY) + val panelEval = new EvaluateReferencePanel(bamGenotypes, bamGenotypesImputed, refPanel, fractionLeftOut, args) + panelEval.enqueueCommands() + add(new EvalPanelAtBAMCalledSites(bamGenotypesImputed, bamGenotypes, panelEval.getBootstrap)) + } + } } } }