BeagleOutputToVCF now accepts an option to keep monomorphic sites. This is useful to genotype a single sample, where having AC=0 just means that the sample is hom-ref at the site.

ProduceBeagleInputWalker can optionally emit a beagle markers file, necessary to use the beagled reference panel for imputation.  Also supports the VQSR calibration curve idea that a site can be flagged as a certain FP, based on the VQSLOD field.  This allows us to have both continuous quality in the refinement of sites as well as hard filtering at some threshold so we don't end up with lots of sites with all 1/3 1/3 1/3 likelihoods for all samples (i.e., a definite FP site where we don't know anything about the samples). 

Added a new VariantsToBeagleUnphased walker that writes out a marker drive hard-call unphased genotypes file suitable for imputating missing genotypes with a reference panel with beagle.  Can optionally keep back a fraction of sites, marked as missing in the genotypes file, for assessment of imputation accuracy and power.  The bootstrap sites can be written to a separate VCF for assessment as well.

Finally, my general Queue script for creating and evaluating reference panels from VCF files.  Supports explicitly genotyping a BAM file at each panel SNP site, for assessment of imputation accuracy of a reference panel.  Lots of options for exploring the impact of the VQS likelihooods, multiple VCFs for constructing the reference panel, as well as fraction of sites left out in assessing the panel's power.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5467 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-18 03:08:38 +00:00
parent 9b8d41160b
commit abc7d1aef9
5 changed files with 531 additions and 91 deletions

View File

@ -68,6 +68,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
@Deprecated
protected String oldOutputArg;
@Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false)
public boolean DONT_FILTER_MONOMORPHIC_SITES = false;
@Argument(fullName="no" +
"call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false)
private double noCallThreshold = 0.0;
@ -304,7 +307,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
}
VariantContext filteredVC;
if ( beagleVarCounts > 0)
if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES )
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
else {
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);

View File

@ -45,6 +45,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File;
@ -64,6 +65,10 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null;
@Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
protected PrintStream markers = null;
int markerCounter = 1;
@Hidden
@Input(doc="VQSqual calibration file", shortName = "cc", required=false)
protected File VQSRCalibrationFile = null;
@ -73,10 +78,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(doc="VQSqual key", shortName = "vqskey", required=false)
protected String VQSLOD_KEY = "VQSqual";
// @Hidden
// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
// protected boolean includeFilteredRecords = false;
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
@ -92,12 +93,13 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false)
public double variantPrior = 0.96;
private Set<String> samples = null;
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
private Random generator;
private int bootstrapSetSize = 0;
private int testSetSize = 0;
private CachingFormatter formatter = new CachingFormatter("%5.4f ", 100000);
private int certainFPs = 0;
public void initialize() {
generator = new Random();
@ -122,17 +124,15 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker != null ) {
GenomeLoc loc = context.getLocation();
VariantContext variant_eval;
VariantContext validation_eval;
VariantContext variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true);
VariantContext validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true);
variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true);
validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true);
if ( goodSite(variant_eval,validation_eval) ) {
if ( useValidation(variant_eval,validation_eval, ref) ) {
writeBeagleOutput(validation_eval,variant_eval,true,validationPrior);
if ( useValidation(validation_eval, ref) ) {
writeBeagleOutput(validation_eval, variant_eval, true, validationPrior);
return 1;
} else {
if ( goodSite(variant_eval) ) {
writeBeagleOutput(variant_eval,validation_eval,false,variantPrior);
@ -141,7 +141,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
return 0;
}
}
return 1;
} else {
return 0;
}
@ -155,11 +154,23 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
}
public boolean goodSite(VariantContext v) {
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
//return v != null && (includeFilteredRecords || ! v.isFiltered()) && v.isBiallelic() && v.hasGenotypes();
if ( canBeOutputToBeagle(v) ) {
if ( VQSRCalibrator != null && VQSRCalibrator.certainFalsePositive(VQSLOD_KEY, v) ) {
certainFPs++;
return false;
} else {
return true;
}
} else {
return false;
}
}
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
public static boolean canBeOutputToBeagle(VariantContext v) {
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
}
public boolean useValidation(VariantContext validation, ReferenceContext ref) {
if( goodSite(validation) ) {
// if using record keeps us below expected proportion, use it
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
@ -167,7 +178,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
if ( bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() );
}
bootstrapSetSize ++;
bootstrapSetSize++;
return true;
} else {
if ( bootstrapVCFOutput != null ) {
@ -191,7 +202,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC);
StringBuffer beagleOut = new StringBuffer();
beagleOut.append(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart()));
String marker = String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart());
beagleOut.append(marker);
if ( markers != null ) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
for ( Allele allele : preferredVC.getAlleles() ) {
String bglPrintString;
if (allele.isNoCall() || allele.isNull())
@ -200,7 +213,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
beagleOut.append(String.format("%s ", bglPrintString));
if ( markers != null ) markers.append(bglPrintString).append("\t");
}
if ( markers != null ) markers.append("\n");
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
@ -265,13 +280,15 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
}
private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) {
if ( VQSRCalibrator != null )
if ( VQSRCalibrator != null ) {
log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
}
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
// see if we need to randomly mask out genotype in this position.
for (double likeVal: normalizedLikelihoods) {
out.append(String.format("%5.4f ",likeVal));
out.append(formatter.format(likeVal));
// out.append(String.format("%5.4f ",likeVal));
}
}
@ -281,11 +298,13 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
}
public Integer reduce( Integer value, Integer sum ) {
return 0; // Nothing to do here
return value + sum; // count up the sites
}
public void onTraversalDone( Integer sum ) {
public void onTraversalDone( Integer includedSites ) {
logger.info("Sites included in beagle likelihoods file : " + includedSites);
logger.info(String.format("Certain false positive found from recalibration curve : %d (%.2f%%)",
certainFPs, (100.0 * certainFPs) / (Math.max(certainFPs + includedSites, 1))));
}
private void initializeVcfWriter() {
@ -301,4 +320,114 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
}
public static class CachingFormatter {
private int maxCacheSize = 0;
private String format;
private LRUCache<Double, String> cache;
public String getFormat() {
return format;
}
public String format(double value) {
String f = cache.get(value);
if ( f == null ) {
f = String.format(format, value);
cache.put(value, f);
// if ( cache.usedEntries() < maxCacheSize ) {
// System.out.printf("CACHE size %d%n", cache.usedEntries());
// } else {
// System.out.printf("CACHE is full %f%n", value);
// }
// }
// } else {
// System.out.printf("CACHE hit %f%n", value);
// }
}
return f;
}
public CachingFormatter(String format, int maxCacheSize) {
this.maxCacheSize = maxCacheSize;
this.format = format;
this.cache = new LRUCache<Double, String>(maxCacheSize);
}
}
/**
* An LRU cache, based on <code>LinkedHashMap</code>.
*
* <p>
* This cache has a fixed maximum number of elements (<code>cacheSize</code>).
* If the cache is full and another entry is added, the LRU (least recently used) entry is dropped.
*
* <p>
* This class is thread-safe. All methods of this class are synchronized.
*
* <p>
* Author: Christian d'Heureuse, Inventec Informatik AG, Zurich, Switzerland<br>
* Multi-licensed: EPL / LGPL / GPL / AL / BSD.
*/
public static class LRUCache<K,V> {
private static final float hashTableLoadFactor = 0.75f;
private LinkedHashMap<K,V> map;
private int cacheSize;
/**
* Creates a new LRU cache.
* @param cacheSize the maximum number of entries that will be kept in this cache.
*/
public LRUCache (int cacheSize) {
this.cacheSize = cacheSize;
int hashTableCapacity = (int)Math.ceil(cacheSize / hashTableLoadFactor) + 1;
map = new LinkedHashMap<K,V>(hashTableCapacity, hashTableLoadFactor, true) {
// (an anonymous inner class)
private static final long serialVersionUID = 1;
@Override protected boolean removeEldestEntry (Map.Entry<K,V> eldest) {
return size() > LRUCache.this.cacheSize; }}; }
/**
* Retrieves an entry from the cache.<br>
* The retrieved entry becomes the MRU (most recently used) entry.
* @param key the key whose associated value is to be returned.
* @return the value associated to this key, or null if no value with this key exists in the cache.
*/
public synchronized V get (K key) {
return map.get(key); }
/**
* Adds an entry to this cache.
* The new entry becomes the MRU (most recently used) entry.
* If an entry with the specified key already exists in the cache, it is replaced by the new entry.
* If the cache is full, the LRU (least recently used) entry is removed from the cache.
* @param key the key with which the specified value is to be associated.
* @param value a value to be associated with the specified key.
*/
public synchronized void put (K key, V value) {
map.put (key, value); }
/**
* Clears the cache.
*/
public synchronized void clear() {
map.clear(); }
/**
* Returns the number of used entries in the cache.
* @return the number of entries currently in the cache.
*/
public synchronized int usedEntries() {
return map.size(); }
/**
* Returns a <code>Collection</code> that contains a copy of all cache entries.
* @return a <code>Collection</code> with a copy of the cache content.
*/
public synchronized Collection<Map.Entry<K,V>> getAll() {
return new ArrayList<Map.Entry<K,V>>(map.entrySet()); }
} // end class LRUCache
}

View File

@ -0,0 +1,179 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.beagle;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.PrintStream;
import java.util.*;
/**
* Produces an input file to Beagle imputation engine, listing unphased, hard-called genotypes for a single sample
* in input variant file. Will additional hold back a fraction of the sites for evaluation, marking the
* genotypes at that sites as missing, and writing the truth of these sites to a second VCF file
*/
@Requires(value={},referenceMetaData=@RMD(name= VariantsToBeagleUnphasedWalker.ROD_NAME, type=VariantContext.class))
public class VariantsToBeagleUnphasedWalker extends RodWalker<Integer, Integer> {
public static final String ROD_NAME = "variant";
@Output(doc="File to which BEAGLE unphased genotypes should be written",required=true)
protected PrintStream beagleWriter = null;
@Argument(fullName = "bootstrap_fraction", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
public double bootstrap = 0.0;
@Argument(fullName = "bootstrap_vcf",shortName = "bsvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
VCFWriter bootstrapVCFOutput = null;
@Argument(fullName = "missing", shortName = "missing", doc = "String to identify missing data in beagle output", required = false)
public String MISSING = "?";
private Set<String> samples = null;
private int bootstrapSetSize = 0;
private int testSetSize = 0;
public void initialize() {
samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME));
beagleWriter.print("I marker alleleA alleleB");
for ( String sample : samples )
beagleWriter.print(String.format(" %s %s", sample, sample));
beagleWriter.println();
if ( bootstrap < 0.0 | bootstrap > 1.0 )
throw new UserException.BadArgumentValue("bootstrap", "Bootstrap value must be fraction between 0 and 1");
if ( bootstrapVCFOutput != null ) {
Set<VCFHeaderLine> hInfo = VCFUtils.getHeaderFields(getToolkit());
bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())));
}
}
/**
* Iterate over each site, emitting the BEAGLE unphased genotypes file format
* @param tracker
* @param ref
* @param context
* @return
*/
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker != null ) {
GenomeLoc loc = context.getLocation();
VariantContext vc = tracker.getVariantContext(ref, ROD_NAME, null, loc, true);
if ( ProduceBeagleInputWalker.canBeOutputToBeagle(vc) ) {
// do we want to hold back this site?
boolean makeMissing = dropSite(vc);
// if we are holding it back and we are writing a bootstrap VCF, write it out
if ( makeMissing && bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(vc, ref.getBase());
}
// regardless, all sites are written to the unphased genotypes file, marked as missing if appropriate
writeUnphasedBeagleOutput(vc, makeMissing);
}
}
return 0;
}
/**
* Do we want to hold back this site for bootstrap? Considers the bootstrap fraction member variable
*
* @param vc
* @return
*/
public boolean dropSite(VariantContext vc) {
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
bootstrapSetSize++;
return true;
} else {
testSetSize++;
return false;
}
}
public void writeUnphasedBeagleOutput(VariantContext vc, boolean makeMissing) {
GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc);
StringBuffer beagleOut = new StringBuffer();
String marker = String.format("%s:%d ",currentLoc.getContig(), currentLoc.getStart());
beagleOut.append("M ").append(marker);
// write out the alleles at this site
for ( Allele allele : vc.getAlleles() ) {
beagleOut.append(allele.isNoCall() || allele.isNull() ? "-" : allele.getBaseString()).append(" ");
}
// write out sample level genotypes
for ( String sample : samples ) {
Genotype genotype = vc.getGenotype(sample);
if ( ! makeMissing && genotype.isCalled() ) {
addAlleles(beagleOut, genotype);
} else {
addAlleles(beagleOut, MISSING, MISSING);
}
}
beagleWriter.println(beagleOut.toString());
}
private void addAlleles(StringBuffer buf, Genotype g) {
addAlleles(buf, g.getAllele(0).getBaseString(), g.getAllele(1).getBaseString());
}
private void addAlleles(StringBuffer buf, String a, String b) {
buf.append(a).append(" ").append(b);
}
public Integer reduceInit() { return 0; }
public Integer reduce( Integer value, Integer sum ) { return value + sum; }
public void onTraversalDone( Integer includedSites ) {
logger.info("Sites included in beagle genotypes file : " + includedSites);
}
}

View File

@ -22,6 +22,7 @@ import java.util.List;
public class VQSRCalibrationCurve {
private final static boolean DEBUG = false;
List<VQSRRange> points;
public static final double CERTAIN_FALSE_POSITIVE = -1;
private static class VQSRRange {
double start, stop, truePositiveRate;
@ -52,7 +53,9 @@ public class VQSRCalibrationCurve {
for ( String line : new XReadLines(source).readLines() ) {
if ( ! line.trim().isEmpty() ) {
String[] parts = line.split("\\s+");
points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), 1 - Double.parseDouble(parts[2])));
double fpRate = Double.parseDouble(parts[2]);
double tpRate = fpRate >= 1.0 ? CERTAIN_FALSE_POSITIVE : 1.0 - fpRate;
points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), tpRate));
}
}
} catch ( FileNotFoundException e ) {
@ -70,6 +73,11 @@ public class VQSRCalibrationCurve {
this.points = points;
}
public boolean certainFalsePositive(String VQSRQualKey, VariantContext vc) {
return probTrueVariant(VQSRQualKey, vc) == CERTAIN_FALSE_POSITIVE;
}
public double probTrueVariant(double VQSRqual) {
for ( VQSRRange r : points ) {
if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() )
@ -90,21 +98,33 @@ public class VQSRCalibrationCurve {
}
}
/**
* Returns a likelihoods vector adjusted by the probability that the site is an error. Returns a
* null vector if the probability of the site being real is 0.0
* @param VQSRQualKey
* @param vc
* @param log10Likelihoods
* @return
*/
public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) {
double[] updated = new double[log10Likelihoods.length];
double alpha = probTrueVariant(VQSRQualKey, vc);
double qual = vc.getAttributeAsDouble(VQSRQualKey); // todo -- remove me
double noInfoPr = 1.0 / 3;
if ( DEBUG ) System.out.printf("------------------------------%n");
for ( int i = 0; i < log10Likelihoods.length; i++) {
double p = Math.pow(10, log10Likelihoods[i]);
double q = alpha * p + (1-alpha) * noInfoPr;
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", qual, p, alpha, q);
updated[i] = Math.log10(q);
}
return updated;
if ( alpha == CERTAIN_FALSE_POSITIVE )
return null;
else {
double noInfoPr = 1.0 / 3;
if ( DEBUG ) System.out.printf("------------------------------%n");
for ( int i = 0; i < log10Likelihoods.length; i++) {
double p = Math.pow(10, log10Likelihoods[i]);
double q = alpha * p + (1-alpha) * noInfoPr;
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q);
updated[i] = Math.log10(q);
}
return updated;
}
}

View File

@ -20,9 +20,18 @@ class RefineGenotypesWithBeagle extends QScript {
@Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve")
@Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false
@Argument(doc="Tmpdir",required=false,shortName="tmpdir") var TMPDIR: File = new File("./")
// assessing imputation performance
@Argument(doc="BAM File for genotyping",required=false, shortName="bam") var bam: File = null
@Argument(doc="Percent of sites that should be left out of BAM VCF to assess imputation", required=false, shortName="flo")
var fractionsLeftOut: List[Double] = List(0.1, 0.2, 0.5, 0.9)
// todo -- this might be best to think about in a different unit -- marker density per bp
val MISSING_KEY = "?"
val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf")
val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")
val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf"
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = qscript.reference
@ -30,94 +39,194 @@ class RefineGenotypesWithBeagle extends QScript {
this.memoryLimit = Some(2)
}
class GunzipFile(in: File, out:File ) extends CommandLineFunction {
@Input(doc="file to gunzip") var inp = in
@Output(doc="file to gunzip to") var outp = out
// --------------------------------------------------------------------------------
//
// GENOTYPING SPECIFIC SITES IN A BAM FILE
//
// --------------------------------------------------------------------------------
def commandLine = "gunzip -c %s > %s".format(inp.getAbsolutePath, outp.getAbsolutePath)
}
class GenotypeBAMAtSites(@Input bam: File, @Input sitesVCF: File, @Output genotypesVCF: File) extends UnifiedGenotyper with GATKArgs {
this.input_file = List(bam)
this.o = genotypesVCF
this.stand_call_conf = Some(0.0)
this.out_mode = Some(org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES)
this.gt_mode = Some(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES)
this.rodBind :+= new RodBind("alleles","VCF",sitesVCF)
class BeagleRefinement(moreBeagleArgs: String = "") extends CommandLineFunction {
@Input(doc="The beagle input file") var beagleInput: File = _
var beagleOutputBase: String = _
var beagleMemoryGigs: Int = BEAGLE_MEM_IN_GB
// we only want chromosome counts annotations
this.BTI = "alleles"
this.G = List("none")
this.A :+= "ChromosomeCounts"
this.nsl = true
/**
* Note: These get set
*/
@Output(doc="The beagle phased file") var beaglePhasedFile: File = _
@Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
@Output(doc="The beagle r2 file") var beagleRSquared: File = _
var beagleOutputDir: String = _
def freezeOutputs = {
if ( beagleInput.getParent == null ) {
beagleOutputDir = ""
} else {
beagleOutputDir = beagleInput.getParent
}
beaglePhasedFile = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".phased.gz")
beagleLikelihoods = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
beagleRSquared = new File(beagleOutputDir + beagleOutputBase +"."+beagleInput.getName+".r2")
// make sure we have the right intervals
if ( interval != null ) {
this.intervalsString = List(interval)
this.BTIMR = Some(org.broadinstitute.sting.utils.interval.IntervalSetRule.INTERSECTION)
}
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s %s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,moreBeagleArgs,beagleOutputBase)
}
def RefineGenotypes(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") = {
var beagleInput = new ProduceBeagleInput with GATKArgs
// --------------------------------------------------------------------------------
//
// BEAGLE COMMANDS
//
// --------------------------------------------------------------------------------
class BeagleCommand(outputBase: String) extends CommandLineFunction {
this.memoryLimit = Some(BEAGLE_MEM_IN_GB)
// Note: These get set
@Output val beaglePhasedFile: File = new File(outputBase +".phased.gz")
@Output val beagleLikelihoods: File = new File(outputBase +".gprobs.gz")
@Output val beagleRSquared: File = new File(outputBase +".r2")
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s out=ignore.me omitprefix=true".format(TMPDIR, BEAGLE_MEM_IN_GB, beagleJar)
}
class RefineGenotypesWithBeagle(@Input beagleInput: File, moreBeagleArgs: String = "")
extends BeagleCommand(beagleInput.getName) {
def myArgs = " like=%s %s".format(beagleInput.getAbsolutePath, moreBeagleArgs)
override def commandLine = super.commandLine + myArgs
}
class ImputeMissingGenotypesWithReferencePanel(@Input evalBeagle: File,
@Input phasedBeagleFile: File,
@Input markers: File,
moreBeagleArgs: String = "")
extends BeagleCommand(evalBeagle.getName) {
def myArgs = " unphased=%s phased=%s markers=%s %s".format(evalBeagle.getAbsolutePath,
phasedBeagleFile.getAbsolutePath, markers.getAbsolutePath, moreBeagleArgs)
override def commandLine = super.commandLine + myArgs
}
class GunzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction {
def commandLine = "gunzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath)
}
// --------------------------------------------------------------------------------
//
// CREATING AND EVALUATING REFERENCE PANELS
//
// --------------------------------------------------------------------------------
class ReferencePanelBuilder(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") {
val beagleInput = new ProduceBeagleInput with GATKArgs
if ( interval != null ) beagleInput.intervalsString = List(interval)
beagleInput.variantVCF = inputVCF
beagleInput.out = swapExt(outputVCF,".vcf",".beagle")
if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE
beagleInput.markers = swapExt(outputVCF, ".vcf", ".markers.txt")
beagleInput.isIntermediate = true
var refine = new BeagleRefinement(moreBeagleArgs)
refine.beagleInput = beagleInput.out
refine.beagleOutputBase = outputVCF.getName + ".bout"
refine.beagleMemoryGigs = BEAGLE_MEM_IN_GB
refine.memoryLimit = Some(BEAGLE_MEM_IN_GB)
refine.freezeOutputs
val refine = new RefineGenotypesWithBeagle(beagleInput.out, moreBeagleArgs)
var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
// unzipPhased.isIntermediate = true
// unzipProbs.isIntermediate = true
val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
//unzipPhased.isIntermediate = true
//unzipProbs.isIntermediate = true
var vcfConvert = new BeagleOutputToVCF with GATKArgs
val vcfConvert = new BeagleOutputToVCF with GATKArgs
vcfConvert.variantVCF = inputVCF
vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared)
vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp)
vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp)
vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out)
vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out)
vcfConvert.out = outputVCF
for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) )
add(cmd)
def getMarkers = beagleInput.markers
def getPanelPhasedHaplotypes = refine.beaglePhasedFile
def getPanelVCF = vcfConvert.out
def enqueueCommands() = {
for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) )
add(cmd)
}
}
class MyEval(@Input(doc="Evaluation file") eval: File) extends VariantEval with GATKArgs {
class EvaluateReferencePanel(@Input evalVCF: File,
@Output outputVCF: File,
panel: ReferencePanelBuilder,
percentLeftOut: Double,
moreBeagleArgs: String = "") {
val evalBeagle = new VariantsToBeagleUnphased with GATKArgs
if ( interval != null ) evalBeagle.intervalsString = List(interval)
evalBeagle.variantVCF = evalVCF
evalBeagle.out = swapExt(outputVCF,".vcf",".unphased.beagle")
evalBeagle.bs = Some(percentLeftOut)
evalBeagle.bsvcf = swapExt(outputVCF,".vcf",".missing.vcf")
evalBeagle.missing = MISSING_KEY
//evalBeagle.isIntermediate = true
val refine = new ImputeMissingGenotypesWithReferencePanel(evalBeagle.out, panel.getPanelPhasedHaplotypes, panel.getMarkers, moreBeagleArgs)
val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
//unzipPhased.isIntermediate = true
//unzipProbs.isIntermediate = true
val vcfConvert = new BeagleOutputToVCF with GATKArgs
vcfConvert.variantVCF = evalVCF
vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared)
vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out)
vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out)
vcfConvert.out = outputVCF
vcfConvert.keep_monomorphic = true
def getBootstrap: File = evalBeagle.bsvcf
def enqueueCommands() = {
for ( cmd: CommandLineFunction <- List(evalBeagle, refine, unzipPhased, unzipProbs, vcfConvert) )
add(cmd)
}
}
class EvalPanelAtChipSites(@Input eval: File) extends VariantEval with GATKArgs {
this.noST = true
this.evalModule :+= "GenotypeConcordance"
this.o = swapExt(eval, ".vcf", ".vcf.eval")
this.rodBind :+= RodBind("eval", "VCF", eval)
this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF)
this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF)
if ( EvalInterval != null ) {
Console.printf("EvalInterval " + EvalInterval)
this.intervalsString = List(EvalInterval)
}
if ( EvalInterval != null ) this.intervalsString = List(EvalInterval)
}
class EvalPanelAtBAMCalledSites(@Input imputedVCF: File, @Input bamGenotypes: File, @Input bootstrap: File) extends VariantEval with GATKArgs {
this.evalModule :+= "GenotypeConcordance"
this.o = swapExt(imputedVCF, ".vcf", ".vcf.eval")
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP_b37)
this.rodBind :+= RodBind("eval", "VCF", imputedVCF)
this.rodBind :+= RodBind("comp_bam_genotypes", "VCF", bamGenotypes)
this.rodBind :+= RodBind("comp_bootstrap", "VCF", bootstrap)
if ( EvalInterval != null ) this.intervalsString = List(EvalInterval)
}
def script = {
for ( vcf <- vcfsToBeagle ) {
if ( ! TEST ) add(new MyEval(vcf))
// for ( niter <- List(10) ) {
var bamGenotypes: File = null
if ( bam != null ) {
bamGenotypes = new File(swapExt(bam, ".bam", "_genotyped_at." + vcf.getName).getName)
add(new GenotypeBAMAtSites(bam, vcf, bamGenotypes))
}
if ( ! TEST ) add(new EvalPanelAtChipSites(vcf))
for ( useCalibrationCurve <- List(true, false) ) {
// for ( includeFilteredRecords <- List(true, false) ) {
for ( niter <- List(10, 25, 50) ) {
for ( niter <- List(10, 20, 50) ) {
if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) {
val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve))
RefineGenotypes(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter))
add(new MyEval(refine_out))
val refPanel = new ReferencePanelBuilder(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter))
refPanel.enqueueCommands()
// start up VE
add(new EvalPanelAtChipSites(refine_out))
if ( bamGenotypes != null ) {
for ( fractionLeftOut <- fractionsLeftOut ) {
val bamGenotypesImputed = swapExt(bamGenotypes, ".vcf", "_flo_%.2f.imputed.vcf".format(fractionLeftOut))
val args = "niterations=%d missing=\"%s\"".format(niter, MISSING_KEY)
val panelEval = new EvaluateReferencePanel(bamGenotypes, bamGenotypesImputed, refPanel, fractionLeftOut, args)
panelEval.enqueueCommands()
add(new EvalPanelAtBAMCalledSites(bamGenotypesImputed, bamGenotypes, panelEval.getBootstrap))
}
}
}
}
}