Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-01-25 19:19:03 -05:00
commit cba5f1a8b1
5 changed files with 38 additions and 275 deletions

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
@ -39,6 +38,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
// TODO: PERMITS WALKER USED TO HAVE A TEMPORARY FIX to prevent NullPointerException caused by bug:
public static boolean PRESERVE_AC_DATA = false;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
@ -51,7 +52,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final int numAlleles = alleles.size();
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false);
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, PRESERVE_AC_DATA);
}
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {

View File

@ -1,114 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashSet;
import java.util.Set;
/**
* Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs).
* Absolutely not supported or recommended for public use.
* Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to
* the name 'allele' so we know which alternate allele to use at each site.
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.READS)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer> implements TreeReducible<Integer> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// enable deletions in the pileup
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; }
public void initialize() {
// get all of the unique sample names
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
// initialize the header
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
writer.writeHeader(new VCFHeader(headerInfo, samples)) ;
}
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext);
return call == null ? null : new VariantCallContext(call, true);
}
public Integer reduceInit() { return 0; }
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}
public Integer reduce(VariantCallContext value, Integer sum) {
if ( value == null )
return sum;
try {
writer.add(value);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum + 1;
}
public void onTraversalDone(Integer sum) {
logger.info(String.format("Visited bases: %d", sum));
}
}

View File

@ -1,152 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
* Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs).
* Absolutely not supported or recommended for public use.
* Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number
* of GL/PL-annotated VCFs bound to a name starting with 'variant'.
*/
public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
public List<RodBinding<VariantContext>> variants;
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// variant track names
private Set<String> trackNames = new HashSet<String>();
public void initialize() {
for ( RodBinding<VariantContext> rb : variants )
trackNames.add(rb.getName());
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
// initialize the header
writer.writeHeader(new VCFHeader(headerInfo, samples));
}
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return null;
List<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
VariantContext mergedVC = mergeVCsWithGLs(VCs);
if ( mergedVC == null )
return null;
return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC);
}
public Integer reduceInit() { return 0; }
public Integer reduce(VariantCallContext value, Integer sum) {
if ( value == null )
return sum;
try {
VariantContextBuilder builder = new VariantContextBuilder(value);
VariantContextUtils.calculateChromosomeCounts(builder, true);
writer.add(builder.make());
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum + 1;
}
public void onTraversalDone(Integer result) {
logger.info(String.format("Visited sites: %d", result));
}
private static VariantContext mergeVCsWithGLs(List<VariantContext> VCs) {
// we can't use the VCUtils classes because our VCs can all be no-calls
if ( VCs.size() == 0 )
return null;
VariantContext variantVC = null;
GenotypesContext genotypes = GenotypesContext.create();
for ( VariantContext vc : VCs ) {
if ( variantVC == null && vc.isVariant() )
variantVC = vc;
genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes()));
}
if ( variantVC == null ) {
VariantContext vc = VCs.get(0);
throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
}
return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make();
}
private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) {
GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size());
for ( final Genotype g : genotypes ) {
if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null )
genotypesWithGLs.add(g);
}
return genotypesWithGLs;
}
}

View File

@ -120,6 +120,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;
@Hidden
@Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false)
public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE;
/**
* Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
*/
@ -236,13 +240,24 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
return 0;
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic
for ( VariantContext.Type type : VariantContext.Type.values() ) {
if ( VCsByType.containsKey(type) )
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic
for (VariantContext.Type type : VariantContext.Type.values()) {
if (VCsByType.containsKey(type))
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
}
else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs,
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
else {
logger.warn("Ignoring all records at site " + ref.getLocus());
}
for ( VariantContext mergedVC : mergedVCs ) {

View File

@ -29,6 +29,7 @@ import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -471,6 +472,18 @@ public class VariantContextUtils {
KEEP_UNCONDITIONAL
}
@Hidden
public enum MultipleAllelesMergeType {
/**
* Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record.
*/
BY_TYPE,
/**
* Merge all allele types at the same start position into the same VCF record.
*/
MIX_TYPES
}
/**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with