BeagleOutputToVCF now accepts an option to keep monomorphic sites. This is useful to genotype a single sample, where having AC=0 just means that the sample is hom-ref at the site.

ProduceBeagleInputWalker can optionally emit a beagle markers file, necessary to use the beagled reference panel for imputation.  Also supports the VQSR calibration curve idea that a site can be flagged as a certain FP, based on the VQSLOD field.  This allows us to have both continuous quality in the refinement of sites as well as hard filtering at some threshold so we don't end up with lots of sites with all 1/3 1/3 1/3 likelihoods for all samples (i.e., a definite FP site where we don't know anything about the samples). 

Added a new VariantsToBeagleUnphased walker that writes out a marker drive hard-call unphased genotypes file suitable for imputating missing genotypes with a reference panel with beagle.  Can optionally keep back a fraction of sites, marked as missing in the genotypes file, for assessment of imputation accuracy and power.  The bootstrap sites can be written to a separate VCF for assessment as well.

Finally, my general Queue script for creating and evaluating reference panels from VCF files.  Supports explicitly genotyping a BAM file at each panel SNP site, for assessment of imputation accuracy of a reference panel.  Lots of options for exploring the impact of the VQS likelihooods, multiple VCFs for constructing the reference panel, as well as fraction of sites left out in assessing the panel's power.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5467 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-18 03:08:38 +00:00
parent 9b8d41160b
commit abc7d1aef9
5 changed files with 531 additions and 91 deletions

View File

@ -68,6 +68,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
@Deprecated @Deprecated
protected String oldOutputArg; protected String oldOutputArg;
@Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false)
public boolean DONT_FILTER_MONOMORPHIC_SITES = false;
@Argument(fullName="no" + @Argument(fullName="no" +
"call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false) "call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false)
private double noCallThreshold = 0.0; private double noCallThreshold = 0.0;
@ -304,7 +307,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
} }
VariantContext filteredVC; VariantContext filteredVC;
if ( beagleVarCounts > 0) if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES )
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes()); filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
else { else {
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1); Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);

View File

@ -45,6 +45,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File; import java.io.File;
@ -64,6 +65,10 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Output(doc="File to which BEAGLE input should be written",required=true) @Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null; protected PrintStream beagleWriter = null;
@Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
protected PrintStream markers = null;
int markerCounter = 1;
@Hidden @Hidden
@Input(doc="VQSqual calibration file", shortName = "cc", required=false) @Input(doc="VQSqual calibration file", shortName = "cc", required=false)
protected File VQSRCalibrationFile = null; protected File VQSRCalibrationFile = null;
@ -73,10 +78,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(doc="VQSqual key", shortName = "vqskey", required=false) @Argument(doc="VQSqual key", shortName = "vqskey", required=false)
protected String VQSLOD_KEY = "VQSqual"; protected String VQSLOD_KEY = "VQSqual";
// @Hidden
// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
// protected boolean includeFilteredRecords = false;
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0; public double insertedNoCallRate = 0;
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
@ -92,12 +93,13 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false) @Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false)
public double variantPrior = 0.96; public double variantPrior = 0.96;
private Set<String> samples = null; private Set<String> samples = null;
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") ); private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
private Random generator; private Random generator;
private int bootstrapSetSize = 0; private int bootstrapSetSize = 0;
private int testSetSize = 0; private int testSetSize = 0;
private CachingFormatter formatter = new CachingFormatter("%5.4f ", 100000);
private int certainFPs = 0;
public void initialize() { public void initialize() {
generator = new Random(); generator = new Random();
@ -122,17 +124,15 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
} }
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker != null ) { if( tracker != null ) {
GenomeLoc loc = context.getLocation(); GenomeLoc loc = context.getLocation();
VariantContext variant_eval; VariantContext variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true);
VariantContext validation_eval; VariantContext validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true);
variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true);
validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true);
if ( goodSite(variant_eval,validation_eval) ) { if ( goodSite(variant_eval,validation_eval) ) {
if ( useValidation(variant_eval,validation_eval, ref) ) { if ( useValidation(validation_eval, ref) ) {
writeBeagleOutput(validation_eval,variant_eval,true,validationPrior); writeBeagleOutput(validation_eval, variant_eval, true, validationPrior);
return 1;
} else { } else {
if ( goodSite(variant_eval) ) { if ( goodSite(variant_eval) ) {
writeBeagleOutput(variant_eval,validation_eval,false,variantPrior); writeBeagleOutput(variant_eval,validation_eval,false,variantPrior);
@ -141,7 +141,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
return 0; return 0;
} }
} }
return 1;
} else { } else {
return 0; return 0;
} }
@ -155,11 +154,23 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
} }
public boolean goodSite(VariantContext v) { public boolean goodSite(VariantContext v) {
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); if ( canBeOutputToBeagle(v) ) {
//return v != null && (includeFilteredRecords || ! v.isFiltered()) && v.isBiallelic() && v.hasGenotypes(); if ( VQSRCalibrator != null && VQSRCalibrator.certainFalsePositive(VQSLOD_KEY, v) ) {
certainFPs++;
return false;
} else {
return true;
}
} else {
return false;
}
} }
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) { public static boolean canBeOutputToBeagle(VariantContext v) {
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
}
public boolean useValidation(VariantContext validation, ReferenceContext ref) {
if( goodSite(validation) ) { if( goodSite(validation) ) {
// if using record keeps us below expected proportion, use it // if using record keeps us below expected proportion, use it
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
@ -167,7 +178,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
if ( bootstrapVCFOutput != null ) { if ( bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() ); bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() );
} }
bootstrapSetSize ++; bootstrapSetSize++;
return true; return true;
} else { } else {
if ( bootstrapVCFOutput != null ) { if ( bootstrapVCFOutput != null ) {
@ -191,7 +202,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC); GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC);
StringBuffer beagleOut = new StringBuffer(); StringBuffer beagleOut = new StringBuffer();
beagleOut.append(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart())); String marker = String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart());
beagleOut.append(marker);
if ( markers != null ) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
for ( Allele allele : preferredVC.getAlleles() ) { for ( Allele allele : preferredVC.getAlleles() ) {
String bglPrintString; String bglPrintString;
if (allele.isNoCall() || allele.isNull()) if (allele.isNoCall() || allele.isNull())
@ -200,7 +213,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
beagleOut.append(String.format("%s ", bglPrintString)); beagleOut.append(String.format("%s ", bglPrintString));
if ( markers != null ) markers.append(bglPrintString).append("\t");
} }
if ( markers != null ) markers.append("\n");
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes(); Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
@ -265,13 +280,15 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
} }
private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) { private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) {
if ( VQSRCalibrator != null ) if ( VQSRCalibrator != null ) {
log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods); log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
}
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
// see if we need to randomly mask out genotype in this position. // see if we need to randomly mask out genotype in this position.
for (double likeVal: normalizedLikelihoods) { for (double likeVal: normalizedLikelihoods) {
out.append(String.format("%5.4f ",likeVal)); out.append(formatter.format(likeVal));
// out.append(String.format("%5.4f ",likeVal));
} }
} }
@ -281,11 +298,13 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
} }
public Integer reduce( Integer value, Integer sum ) { public Integer reduce( Integer value, Integer sum ) {
return 0; // Nothing to do here return value + sum; // count up the sites
} }
public void onTraversalDone( Integer sum ) { public void onTraversalDone( Integer includedSites ) {
logger.info("Sites included in beagle likelihoods file : " + includedSites);
logger.info(String.format("Certain false positive found from recalibration curve : %d (%.2f%%)",
certainFPs, (100.0 * certainFPs) / (Math.max(certainFPs + includedSites, 1))));
} }
private void initializeVcfWriter() { private void initializeVcfWriter() {
@ -301,4 +320,114 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
} }
public static class CachingFormatter {
private int maxCacheSize = 0;
private String format;
private LRUCache<Double, String> cache;
public String getFormat() {
return format;
}
public String format(double value) {
String f = cache.get(value);
if ( f == null ) {
f = String.format(format, value);
cache.put(value, f);
// if ( cache.usedEntries() < maxCacheSize ) {
// System.out.printf("CACHE size %d%n", cache.usedEntries());
// } else {
// System.out.printf("CACHE is full %f%n", value);
// }
// }
// } else {
// System.out.printf("CACHE hit %f%n", value);
// }
}
return f;
}
public CachingFormatter(String format, int maxCacheSize) {
this.maxCacheSize = maxCacheSize;
this.format = format;
this.cache = new LRUCache<Double, String>(maxCacheSize);
}
}
/**
* An LRU cache, based on <code>LinkedHashMap</code>.
*
* <p>
* This cache has a fixed maximum number of elements (<code>cacheSize</code>).
* If the cache is full and another entry is added, the LRU (least recently used) entry is dropped.
*
* <p>
* This class is thread-safe. All methods of this class are synchronized.
*
* <p>
* Author: Christian d'Heureuse, Inventec Informatik AG, Zurich, Switzerland<br>
* Multi-licensed: EPL / LGPL / GPL / AL / BSD.
*/
public static class LRUCache<K,V> {
private static final float hashTableLoadFactor = 0.75f;
private LinkedHashMap<K,V> map;
private int cacheSize;
/**
* Creates a new LRU cache.
* @param cacheSize the maximum number of entries that will be kept in this cache.
*/
public LRUCache (int cacheSize) {
this.cacheSize = cacheSize;
int hashTableCapacity = (int)Math.ceil(cacheSize / hashTableLoadFactor) + 1;
map = new LinkedHashMap<K,V>(hashTableCapacity, hashTableLoadFactor, true) {
// (an anonymous inner class)
private static final long serialVersionUID = 1;
@Override protected boolean removeEldestEntry (Map.Entry<K,V> eldest) {
return size() > LRUCache.this.cacheSize; }}; }
/**
* Retrieves an entry from the cache.<br>
* The retrieved entry becomes the MRU (most recently used) entry.
* @param key the key whose associated value is to be returned.
* @return the value associated to this key, or null if no value with this key exists in the cache.
*/
public synchronized V get (K key) {
return map.get(key); }
/**
* Adds an entry to this cache.
* The new entry becomes the MRU (most recently used) entry.
* If an entry with the specified key already exists in the cache, it is replaced by the new entry.
* If the cache is full, the LRU (least recently used) entry is removed from the cache.
* @param key the key with which the specified value is to be associated.
* @param value a value to be associated with the specified key.
*/
public synchronized void put (K key, V value) {
map.put (key, value); }
/**
* Clears the cache.
*/
public synchronized void clear() {
map.clear(); }
/**
* Returns the number of used entries in the cache.
* @return the number of entries currently in the cache.
*/
public synchronized int usedEntries() {
return map.size(); }
/**
* Returns a <code>Collection</code> that contains a copy of all cache entries.
* @return a <code>Collection</code> with a copy of the cache content.
*/
public synchronized Collection<Map.Entry<K,V>> getAll() {
return new ArrayList<Map.Entry<K,V>>(map.entrySet()); }
} // end class LRUCache
} }

View File

@ -0,0 +1,179 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.beagle;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.PrintStream;
import java.util.*;
/**
* Produces an input file to Beagle imputation engine, listing unphased, hard-called genotypes for a single sample
* in input variant file. Will additional hold back a fraction of the sites for evaluation, marking the
* genotypes at that sites as missing, and writing the truth of these sites to a second VCF file
*/
@Requires(value={},referenceMetaData=@RMD(name= VariantsToBeagleUnphasedWalker.ROD_NAME, type=VariantContext.class))
public class VariantsToBeagleUnphasedWalker extends RodWalker<Integer, Integer> {
public static final String ROD_NAME = "variant";
@Output(doc="File to which BEAGLE unphased genotypes should be written",required=true)
protected PrintStream beagleWriter = null;
@Argument(fullName = "bootstrap_fraction", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
public double bootstrap = 0.0;
@Argument(fullName = "bootstrap_vcf",shortName = "bsvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
VCFWriter bootstrapVCFOutput = null;
@Argument(fullName = "missing", shortName = "missing", doc = "String to identify missing data in beagle output", required = false)
public String MISSING = "?";
private Set<String> samples = null;
private int bootstrapSetSize = 0;
private int testSetSize = 0;
public void initialize() {
samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME));
beagleWriter.print("I marker alleleA alleleB");
for ( String sample : samples )
beagleWriter.print(String.format(" %s %s", sample, sample));
beagleWriter.println();
if ( bootstrap < 0.0 | bootstrap > 1.0 )
throw new UserException.BadArgumentValue("bootstrap", "Bootstrap value must be fraction between 0 and 1");
if ( bootstrapVCFOutput != null ) {
Set<VCFHeaderLine> hInfo = VCFUtils.getHeaderFields(getToolkit());
bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())));
}
}
/**
* Iterate over each site, emitting the BEAGLE unphased genotypes file format
* @param tracker
* @param ref
* @param context
* @return
*/
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker != null ) {
GenomeLoc loc = context.getLocation();
VariantContext vc = tracker.getVariantContext(ref, ROD_NAME, null, loc, true);
if ( ProduceBeagleInputWalker.canBeOutputToBeagle(vc) ) {
// do we want to hold back this site?
boolean makeMissing = dropSite(vc);
// if we are holding it back and we are writing a bootstrap VCF, write it out
if ( makeMissing && bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(vc, ref.getBase());
}
// regardless, all sites are written to the unphased genotypes file, marked as missing if appropriate
writeUnphasedBeagleOutput(vc, makeMissing);
}
}
return 0;
}
/**
* Do we want to hold back this site for bootstrap? Considers the bootstrap fraction member variable
*
* @param vc
* @return
*/
public boolean dropSite(VariantContext vc) {
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
bootstrapSetSize++;
return true;
} else {
testSetSize++;
return false;
}
}
public void writeUnphasedBeagleOutput(VariantContext vc, boolean makeMissing) {
GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc);
StringBuffer beagleOut = new StringBuffer();
String marker = String.format("%s:%d ",currentLoc.getContig(), currentLoc.getStart());
beagleOut.append("M ").append(marker);
// write out the alleles at this site
for ( Allele allele : vc.getAlleles() ) {
beagleOut.append(allele.isNoCall() || allele.isNull() ? "-" : allele.getBaseString()).append(" ");
}
// write out sample level genotypes
for ( String sample : samples ) {
Genotype genotype = vc.getGenotype(sample);
if ( ! makeMissing && genotype.isCalled() ) {
addAlleles(beagleOut, genotype);
} else {
addAlleles(beagleOut, MISSING, MISSING);
}
}
beagleWriter.println(beagleOut.toString());
}
private void addAlleles(StringBuffer buf, Genotype g) {
addAlleles(buf, g.getAllele(0).getBaseString(), g.getAllele(1).getBaseString());
}
private void addAlleles(StringBuffer buf, String a, String b) {
buf.append(a).append(" ").append(b);
}
public Integer reduceInit() { return 0; }
public Integer reduce( Integer value, Integer sum ) { return value + sum; }
public void onTraversalDone( Integer includedSites ) {
logger.info("Sites included in beagle genotypes file : " + includedSites);
}
}

View File

@ -22,6 +22,7 @@ import java.util.List;
public class VQSRCalibrationCurve { public class VQSRCalibrationCurve {
private final static boolean DEBUG = false; private final static boolean DEBUG = false;
List<VQSRRange> points; List<VQSRRange> points;
public static final double CERTAIN_FALSE_POSITIVE = -1;
private static class VQSRRange { private static class VQSRRange {
double start, stop, truePositiveRate; double start, stop, truePositiveRate;
@ -52,7 +53,9 @@ public class VQSRCalibrationCurve {
for ( String line : new XReadLines(source).readLines() ) { for ( String line : new XReadLines(source).readLines() ) {
if ( ! line.trim().isEmpty() ) { if ( ! line.trim().isEmpty() ) {
String[] parts = line.split("\\s+"); String[] parts = line.split("\\s+");
points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), 1 - Double.parseDouble(parts[2]))); double fpRate = Double.parseDouble(parts[2]);
double tpRate = fpRate >= 1.0 ? CERTAIN_FALSE_POSITIVE : 1.0 - fpRate;
points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), tpRate));
} }
} }
} catch ( FileNotFoundException e ) { } catch ( FileNotFoundException e ) {
@ -70,6 +73,11 @@ public class VQSRCalibrationCurve {
this.points = points; this.points = points;
} }
public boolean certainFalsePositive(String VQSRQualKey, VariantContext vc) {
return probTrueVariant(VQSRQualKey, vc) == CERTAIN_FALSE_POSITIVE;
}
public double probTrueVariant(double VQSRqual) { public double probTrueVariant(double VQSRqual) {
for ( VQSRRange r : points ) { for ( VQSRRange r : points ) {
if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() ) if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() )
@ -90,22 +98,34 @@ public class VQSRCalibrationCurve {
} }
} }
/**
* Returns a likelihoods vector adjusted by the probability that the site is an error. Returns a
* null vector if the probability of the site being real is 0.0
* @param VQSRQualKey
* @param vc
* @param log10Likelihoods
* @return
*/
public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) { public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) {
double[] updated = new double[log10Likelihoods.length]; double[] updated = new double[log10Likelihoods.length];
double alpha = probTrueVariant(VQSRQualKey, vc); double alpha = probTrueVariant(VQSRQualKey, vc);
double qual = vc.getAttributeAsDouble(VQSRQualKey); // todo -- remove me
if ( alpha == CERTAIN_FALSE_POSITIVE )
return null;
else {
double noInfoPr = 1.0 / 3; double noInfoPr = 1.0 / 3;
if ( DEBUG ) System.out.printf("------------------------------%n"); if ( DEBUG ) System.out.printf("------------------------------%n");
for ( int i = 0; i < log10Likelihoods.length; i++) { for ( int i = 0; i < log10Likelihoods.length; i++) {
double p = Math.pow(10, log10Likelihoods[i]); double p = Math.pow(10, log10Likelihoods[i]);
double q = alpha * p + (1-alpha) * noInfoPr; double q = alpha * p + (1-alpha) * noInfoPr;
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", qual, p, alpha, q); if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q);
updated[i] = Math.log10(q); updated[i] = Math.log10(q);
} }
return updated; return updated;
} }
}
public void printInfo(Logger logger) { public void printInfo(Logger logger) {

View File

@ -20,9 +20,18 @@ class RefineGenotypesWithBeagle extends QScript {
@Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve") @Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve")
@Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false @Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false
@Argument(doc="Tmpdir",required=false,shortName="tmpdir") var TMPDIR: File = new File("./")
// assessing imputation performance
@Argument(doc="BAM File for genotyping",required=false, shortName="bam") var bam: File = null
@Argument(doc="Percent of sites that should be left out of BAM VCF to assess imputation", required=false, shortName="flo")
var fractionsLeftOut: List[Double] = List(0.1, 0.2, 0.5, 0.9)
// todo -- this might be best to think about in a different unit -- marker density per bp
val MISSING_KEY = "?"
val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf") val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf")
val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf") val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")
val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf"
trait GATKArgs extends CommandLineGATK { trait GATKArgs extends CommandLineGATK {
this.reference_sequence = qscript.reference this.reference_sequence = qscript.reference
@ -30,94 +39,194 @@ class RefineGenotypesWithBeagle extends QScript {
this.memoryLimit = Some(2) this.memoryLimit = Some(2)
} }
class GunzipFile(in: File, out:File ) extends CommandLineFunction { // --------------------------------------------------------------------------------
@Input(doc="file to gunzip") var inp = in //
@Output(doc="file to gunzip to") var outp = out // GENOTYPING SPECIFIC SITES IN A BAM FILE
//
// --------------------------------------------------------------------------------
def commandLine = "gunzip -c %s > %s".format(inp.getAbsolutePath, outp.getAbsolutePath) class GenotypeBAMAtSites(@Input bam: File, @Input sitesVCF: File, @Output genotypesVCF: File) extends UnifiedGenotyper with GATKArgs {
this.input_file = List(bam)
this.o = genotypesVCF
this.stand_call_conf = Some(0.0)
this.out_mode = Some(org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES)
this.gt_mode = Some(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES)
this.rodBind :+= new RodBind("alleles","VCF",sitesVCF)
// we only want chromosome counts annotations
this.BTI = "alleles"
this.G = List("none")
this.A :+= "ChromosomeCounts"
this.nsl = true
// make sure we have the right intervals
if ( interval != null ) {
this.intervalsString = List(interval)
this.BTIMR = Some(org.broadinstitute.sting.utils.interval.IntervalSetRule.INTERSECTION)
}
} }
class BeagleRefinement(moreBeagleArgs: String = "") extends CommandLineFunction { // --------------------------------------------------------------------------------
@Input(doc="The beagle input file") var beagleInput: File = _ //
var beagleOutputBase: String = _ // BEAGLE COMMANDS
var beagleMemoryGigs: Int = BEAGLE_MEM_IN_GB //
// --------------------------------------------------------------------------------
/** class BeagleCommand(outputBase: String) extends CommandLineFunction {
* Note: These get set this.memoryLimit = Some(BEAGLE_MEM_IN_GB)
*/
@Output(doc="The beagle phased file") var beaglePhasedFile: File = _
@Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
@Output(doc="The beagle r2 file") var beagleRSquared: File = _
var beagleOutputDir: String = _
def freezeOutputs = { // Note: These get set
if ( beagleInput.getParent == null ) { @Output val beaglePhasedFile: File = new File(outputBase +".phased.gz")
beagleOutputDir = "" @Output val beagleLikelihoods: File = new File(outputBase +".gprobs.gz")
} else { @Output val beagleRSquared: File = new File(outputBase +".r2")
beagleOutputDir = beagleInput.getParent
} def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s out=ignore.me omitprefix=true".format(TMPDIR, BEAGLE_MEM_IN_GB, beagleJar)
beaglePhasedFile = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".phased.gz")
beagleLikelihoods = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
beagleRSquared = new File(beagleOutputDir + beagleOutputBase +"."+beagleInput.getName+".r2")
} }
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s %s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,moreBeagleArgs,beagleOutputBase) class RefineGenotypesWithBeagle(@Input beagleInput: File, moreBeagleArgs: String = "")
extends BeagleCommand(beagleInput.getName) {
def myArgs = " like=%s %s".format(beagleInput.getAbsolutePath, moreBeagleArgs)
override def commandLine = super.commandLine + myArgs
} }
def RefineGenotypes(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") = { class ImputeMissingGenotypesWithReferencePanel(@Input evalBeagle: File,
var beagleInput = new ProduceBeagleInput with GATKArgs @Input phasedBeagleFile: File,
@Input markers: File,
moreBeagleArgs: String = "")
extends BeagleCommand(evalBeagle.getName) {
def myArgs = " unphased=%s phased=%s markers=%s %s".format(evalBeagle.getAbsolutePath,
phasedBeagleFile.getAbsolutePath, markers.getAbsolutePath, moreBeagleArgs)
override def commandLine = super.commandLine + myArgs
}
class GunzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction {
def commandLine = "gunzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath)
}
// --------------------------------------------------------------------------------
//
// CREATING AND EVALUATING REFERENCE PANELS
//
// --------------------------------------------------------------------------------
class ReferencePanelBuilder(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") {
val beagleInput = new ProduceBeagleInput with GATKArgs
if ( interval != null ) beagleInput.intervalsString = List(interval) if ( interval != null ) beagleInput.intervalsString = List(interval)
beagleInput.variantVCF = inputVCF beagleInput.variantVCF = inputVCF
beagleInput.out = swapExt(outputVCF,".vcf",".beagle") beagleInput.out = swapExt(outputVCF,".vcf",".beagle")
if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE
beagleInput.markers = swapExt(outputVCF, ".vcf", ".markers.txt")
beagleInput.isIntermediate = true
var refine = new BeagleRefinement(moreBeagleArgs) val refine = new RefineGenotypesWithBeagle(beagleInput.out, moreBeagleArgs)
refine.beagleInput = beagleInput.out
refine.beagleOutputBase = outputVCF.getName + ".bout"
refine.beagleMemoryGigs = BEAGLE_MEM_IN_GB
refine.memoryLimit = Some(BEAGLE_MEM_IN_GB)
refine.freezeOutputs
var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
// unzipPhased.isIntermediate = true //unzipPhased.isIntermediate = true
// unzipProbs.isIntermediate = true //unzipProbs.isIntermediate = true
var vcfConvert = new BeagleOutputToVCF with GATKArgs val vcfConvert = new BeagleOutputToVCF with GATKArgs
vcfConvert.variantVCF = inputVCF vcfConvert.variantVCF = inputVCF
vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared) vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared)
vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp) vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out)
vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp) vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out)
vcfConvert.out = outputVCF vcfConvert.out = outputVCF
def getMarkers = beagleInput.markers
def getPanelPhasedHaplotypes = refine.beaglePhasedFile
def getPanelVCF = vcfConvert.out
def enqueueCommands() = {
for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) ) for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) )
add(cmd) add(cmd)
} }
}
class MyEval(@Input(doc="Evaluation file") eval: File) extends VariantEval with GATKArgs { class EvaluateReferencePanel(@Input evalVCF: File,
@Output outputVCF: File,
panel: ReferencePanelBuilder,
percentLeftOut: Double,
moreBeagleArgs: String = "") {
val evalBeagle = new VariantsToBeagleUnphased with GATKArgs
if ( interval != null ) evalBeagle.intervalsString = List(interval)
evalBeagle.variantVCF = evalVCF
evalBeagle.out = swapExt(outputVCF,".vcf",".unphased.beagle")
evalBeagle.bs = Some(percentLeftOut)
evalBeagle.bsvcf = swapExt(outputVCF,".vcf",".missing.vcf")
evalBeagle.missing = MISSING_KEY
//evalBeagle.isIntermediate = true
val refine = new ImputeMissingGenotypesWithReferencePanel(evalBeagle.out, panel.getPanelPhasedHaplotypes, panel.getMarkers, moreBeagleArgs)
val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
//unzipPhased.isIntermediate = true
//unzipProbs.isIntermediate = true
val vcfConvert = new BeagleOutputToVCF with GATKArgs
vcfConvert.variantVCF = evalVCF
vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared)
vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.out)
vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.out)
vcfConvert.out = outputVCF
vcfConvert.keep_monomorphic = true
def getBootstrap: File = evalBeagle.bsvcf
def enqueueCommands() = {
for ( cmd: CommandLineFunction <- List(evalBeagle, refine, unzipPhased, unzipProbs, vcfConvert) )
add(cmd)
}
}
class EvalPanelAtChipSites(@Input eval: File) extends VariantEval with GATKArgs {
this.noST = true this.noST = true
this.evalModule :+= "GenotypeConcordance" this.evalModule :+= "GenotypeConcordance"
this.o = swapExt(eval, ".vcf", ".vcf.eval") this.o = swapExt(eval, ".vcf", ".vcf.eval")
this.rodBind :+= RodBind("eval", "VCF", eval) this.rodBind :+= RodBind("eval", "VCF", eval)
this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF) this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF)
this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF) this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF)
if ( EvalInterval != null ) { if ( EvalInterval != null ) this.intervalsString = List(EvalInterval)
Console.printf("EvalInterval " + EvalInterval)
this.intervalsString = List(EvalInterval)
} }
class EvalPanelAtBAMCalledSites(@Input imputedVCF: File, @Input bamGenotypes: File, @Input bootstrap: File) extends VariantEval with GATKArgs {
this.evalModule :+= "GenotypeConcordance"
this.o = swapExt(imputedVCF, ".vcf", ".vcf.eval")
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP_b37)
this.rodBind :+= RodBind("eval", "VCF", imputedVCF)
this.rodBind :+= RodBind("comp_bam_genotypes", "VCF", bamGenotypes)
this.rodBind :+= RodBind("comp_bootstrap", "VCF", bootstrap)
if ( EvalInterval != null ) this.intervalsString = List(EvalInterval)
} }
def script = { def script = {
for ( vcf <- vcfsToBeagle ) { for ( vcf <- vcfsToBeagle ) {
if ( ! TEST ) add(new MyEval(vcf)) var bamGenotypes: File = null
// for ( niter <- List(10) ) { if ( bam != null ) {
bamGenotypes = new File(swapExt(bam, ".bam", "_genotyped_at." + vcf.getName).getName)
add(new GenotypeBAMAtSites(bam, vcf, bamGenotypes))
}
if ( ! TEST ) add(new EvalPanelAtChipSites(vcf))
for ( useCalibrationCurve <- List(true, false) ) { for ( useCalibrationCurve <- List(true, false) ) {
// for ( includeFilteredRecords <- List(true, false) ) { for ( niter <- List(10, 20, 50) ) {
for ( niter <- List(10, 25, 50) ) {
if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) { if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) {
val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve)) val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve))
RefineGenotypes(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter)) val refPanel = new ReferencePanelBuilder(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter))
add(new MyEval(refine_out)) refPanel.enqueueCommands()
// start up VE
add(new EvalPanelAtChipSites(refine_out))
if ( bamGenotypes != null ) {
for ( fractionLeftOut <- fractionsLeftOut ) {
val bamGenotypesImputed = swapExt(bamGenotypes, ".vcf", "_flo_%.2f.imputed.vcf".format(fractionLeftOut))
val args = "niterations=%d missing=\"%s\"".format(niter, MISSING_KEY)
val panelEval = new EvaluateReferencePanel(bamGenotypes, bamGenotypesImputed, refPanel, fractionLeftOut, args)
panelEval.enqueueCommands()
add(new EvalPanelAtBAMCalledSites(bamGenotypesImputed, bamGenotypes, panelEval.getBootstrap))
}
}
} }
} }
} }