Merge branch 'master' of ssh://chartl@ni.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Christopher Hartl 2012-01-26 12:38:24 -05:00
commit 9d4b84f6bd
17 changed files with 450 additions and 351 deletions

View File

@ -92,7 +92,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus );
final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
}
@ -109,7 +109,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
if( !locusView.hasNext() ) {
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus );
final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
}
@ -128,7 +128,16 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// add these blocks of work to the work queue
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList );
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
workQueue.addAll( activeRegions );
if( walker.activeRegionOutStream == null ) {
workQueue.addAll( activeRegions );
} else { // Just want to output the active regions to a file, not actually process them
for( final ActiveRegion activeRegion : activeRegions ) {
if( activeRegion.isActive ) {
walker.activeRegionOutStream.println( activeRegion.getLocation() );
}
}
}
// Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
if( !workQueue.isEmpty() ) {

View File

@ -1,6 +1,11 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
@ -14,8 +19,10 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
@ -32,6 +39,31 @@ import java.util.List;
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
public PrintStream activeRegionOutStream = null;
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
protected List<IntervalBinding<Feature>> activeRegionBindings = null;
public GenomeLocSortedSet presetActiveRegions = null;
@Override
public void initialize() {
    // Nothing to do unless the user supplied one or more -AR interval files.
    if( activeRegionBindings == null ) { return; }

    // Union together the intervals from every supplied binding.
    List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
    // Use the parameterized type (was a raw IntervalBinding, which defeats generic type checking).
    for ( final IntervalBinding<Feature> intervalBinding : activeRegionBindings ) {
        List<GenomeLoc> intervals = intervalBinding.getIntervals(this.getToolkit());

        if ( intervals.isEmpty() ) {
            logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
        }

        allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION);
    }

    // Sort and merge overlapping/adjacent intervals into the preset active-region set
    // consulted by the traversal engine in place of the walker's isActive() calls.
    presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL);
}
// Do we actually want to operate on the context?
public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
return true; // We are keeping all the reads

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
@ -23,7 +24,7 @@ import java.util.*;
* Time: 12:24 PM
* To change this template use File | Settings | File Templates.
*/
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation {
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
private MendelianViolation mendelianViolation = null;
private String motherId;

View File

@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.BaseUtils;
@ -84,7 +83,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
/**
* The INFO field will be annotated with information on the most biologically-significant effect
@ -163,6 +161,13 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
protected Boolean LIST = false;
/**
* By default, the dbSNP ID is added only when the ID field in the variant VCF is empty.
*/
@Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated")
protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
@Hidden
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;

View File

@ -195,11 +195,20 @@ public class VariantAnnotatorEngine {
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
// put the DB key into the INFO field
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
// annotate dbsnp id if available and not already there
if ( rsID != null && vc.emptyID() )
vc = new VariantContextBuilder(vc).id(rsID).make();
// add the ID if appropriate
if ( rsID != null ) {
if ( vc.emptyID() ) {
vc = new VariantContextBuilder(vc).id(rsID).make();
} else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) {
final String newRsID = vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID;
vc = new VariantContextBuilder(vc).id(newRsID).make();
}
}
} else {
boolean overlapsComp = false;
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {

View File

@ -8,9 +8,9 @@ import java.util.List;
/**
 * Contract implemented by walkers that can drive the VariantAnnotatorEngine:
 * exposes the rod bindings (and dbSNP ID policy) the engine needs.
 */
public interface AnnotatorCompatibleWalker {

    // getter methods for the various bindings used during annotation
    // (interface members are implicitly public and abstract)
    RodBinding<VariantContext> getVariantRodBinding();
    RodBinding<VariantContext> getSnpEffRodBinding();
    RodBinding<VariantContext> getDbsnpRodBinding();
    List<RodBinding<VariantContext>> getCompRodBindings();
    List<RodBinding<VariantContext>> getResourceRodBindings();

    // whether the dbSNP rsID should be appended even when the record already has an ID
    boolean alwaysAppendDbsnpId();
}

View File

@ -39,7 +39,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
}
@ -166,7 +165,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final int numChr = 2*numSamples;
// queue of AC conformations to process
final Queue<ExactACset> ACqueue = new LinkedList<ExactACset>();
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
@ -177,11 +176,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
// optimization: create the temporary storage for computing L(j,k) just once
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
for ( int i = 0; i < maxPossibleDependencies; i++ )
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
// optimization: create the temporary storage for computing L(j,k) just once
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
for ( int i = 0; i < maxPossibleDependencies; i++ )
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
// keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY;
@ -195,16 +194,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
// Small immutable pair tying an allele-count (AC) conformation to the index of the
// genotype-likelihood (PL) entry it depends on, so queued updates can be replayed in order.
private static final class DependentSet {
    public final int[] ACcounts;  // the dependent AC conformation
    public final int PLindex;     // index into the PL vector for this conformation

    public DependentSet(final int[] counts, final int index) {
        ACcounts = counts;
        PLindex = index;
    }
}
private static double calculateAlleleCountConformation(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double maxLog10L,
final int numChr,
final boolean preserveData,
final Queue<ExactACset> ACqueue,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result,
final double[][] tempLog10ConformationLikelihoods) {
final double[][] tempLog10ConformationLikelihoods) {
//if ( DEBUG )
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
@ -215,7 +224,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// clean up memory
if ( !preserveData ) {
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
indexesToACset.put(index, null);
indexesToACset.remove(index);
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
}
@ -230,7 +239,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// no reason to keep this data around because nothing depends on it
if ( !preserveData )
indexesToACset.put(set.ACcounts, null);
indexesToACset.remove(set.ACcounts);
return log10LofK;
}
@ -240,7 +249,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
final int numAltAlleles = set.ACcounts.getCounts().length;
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
@ -251,30 +259,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele]++;
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
}
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
if ( ACwiggle > 1 ) {
final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++;
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset);
if ( allele_i == allele_j )
sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
else
differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
}
}
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
for ( DependentSet dependent : differentAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
for ( DependentSet dependent : sameAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
}
// if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate
// over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early)
if ( !preserveData && lastSet == null ) {
//if ( DEBUG )
// System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts);
lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
// determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
if ( !preserveData ) {
final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
if ( lastSet != null )
lastSet.dependentACsetsToDelete.add(set.ACcounts);
}
if ( lastSet != null )
lastSet.dependentACsetsToDelete.add(set.ACcounts);
return log10LofK;
}
@ -282,34 +300,36 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also adds it as a dependency to the given callingSetIndex.
// returns the ExactACset if that set was not already in the queue and null otherwise.
private static ExactACset updateACset(final int[] ACcounts,
final int numChr,
final ExactACset callingSet,
final int PLsetIndex,
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
private static void updateACset(final int[] ACcounts,
final int numChr,
final ExactACset callingSet,
final int PLsetIndex,
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
final ExactACcounts index = new ExactACcounts(ACcounts);
boolean wasInQueue = true;
if ( !indexesToACset.containsKey(index) ) {
ExactACset set = new ExactACset(numChr/2 +1, index);
indexesToACset.put(index, set);
ACqueue.add(set);
wasInQueue = false;
}
// add the given dependency to the set
//if ( DEBUG )
// System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
final ExactACset set = indexesToACset.get(index);
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
return wasInQueue ? null : set;
}
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue<ExactACset> ACqueue) {
ExactACset set = null;
for ( ExactACset queued : ACqueue ) {
if ( queued.dependentACsetsToDelete.contains(callingSetIndex) )
set = queued;
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
while ( reverseIterator.hasNext() ) {
final ExactACset queued = reverseIterator.next();
if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
return queued;
}
return set;
// shouldn't get here
throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
}
private static void computeLofK(final ExactACset set,
@ -317,7 +337,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result,
final double[][] tempLog10ConformationLikelihoods) {
final double[][] tempLog10ConformationLikelihoods) {
set.log10Likelihoods[0] = 0.0; // the zero case
final int totalK = set.getACsum();
@ -329,40 +349,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
// k > 0 for at least one k
else {
// deal with the non-AA possible conformations
int conformationIndex = 1;
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
//if ( DEBUG )
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
// deal with the non-AA possible conformations
int conformationIndex = 1;
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
//if ( DEBUG )
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
ExactACset dependent = indexesToACset.get(mapping.getKey());
ExactACset dependent = indexesToACset.get(mapping.getKey());
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK <= 2*j ) { // skip impossible conformations
final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][conformationIndex] =
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
if ( totalK <= 2*j ) { // skip impossible conformations
final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][conformationIndex] =
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
} else {
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
}
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
}
}
conformationIndex++;
}
conformationIndex++;
}
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
} else {
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
}
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
} else {
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
}
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
set.log10Likelihoods[j] = log10Max - logDenominator;
}

View File

@ -1,114 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashSet;
import java.util.Set;
/**
* Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs).
* Absolutely not supported or recommended for public use.
* Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to
* the name 'allele' so we know which alternate allele to use at each site.
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.READS)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer> implements TreeReducible<Integer> {

    // all UnifiedGenotyper arguments; handed to the UG engine in initialize()
    @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();

    // control the output
    @Output(doc="File to which variants should be written",required=true)
    protected VCFWriter writer = null;

    // the calculation arguments
    private UnifiedGenotyperEngine UG_engine = null;

    // enable deletions in the pileup
    public boolean includeReadsWithDeletionAtLoci() { return true; }

    // enable extended events for indels (any GL model other than SNP-only)
    public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; }

    /**
     * Builds the UG engine from the samples found in the input BAM header and
     * writes the VCF header (INFO/FORMAT lines) before traversal begins.
     * Note: the engine must be constructed before the header is written.
     */
    public void initialize() {
        // get all of the unique sample names
        Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());

        UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);

        // initialize the header
        Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
        headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
        headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
        headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
        writer.writeHeader(new VCFHeader(headerInfo, samples)) ;
    }

    /**
     * Computes per-sample genotype likelihoods at this locus via the UG engine.
     *
     * @return a VariantCallContext wrapping the call, or null when the engine
     *         produced no call at this locus
     */
    public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
        VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext);
        return call == null ? null : new VariantCallContext(call, true);
    }

    // reduce accumulates a count of emitted records, starting from zero
    public Integer reduceInit() { return 0; }

    // tree-reduce just sums the per-shard counts
    public Integer treeReduce(Integer lhs, Integer rhs) {
        return lhs + rhs;
    }

    /**
     * Writes the call (if any) to the VCF writer and increments the count.
     * Wraps writer errors with a hint about a common misconfiguration.
     */
    public Integer reduce(VariantCallContext value, Integer sum) {
        if ( value == null )
            return sum;

        try {
            writer.add(value);
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
        }

        return sum + 1;
    }

    // NOTE(review): the message says "bases" but `sum` counts emitted call records — confirm intent
    public void onTraversalDone(Integer sum) {
        logger.info(String.format("Visited bases: %d", sum));
    }
}

View File

@ -1,152 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
* Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs).
* Absolutely not supported or recommended for public use.
* Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number
* of GL/PL-annotated VCFs bound to a name starting with 'variant'.
*/
public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
public List<RodBinding<VariantContext>> variants;
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// variant track names
private Set<String> trackNames = new HashSet<String>();
public void initialize() {
for ( RodBinding<VariantContext> rb : variants )
trackNames.add(rb.getName());
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
// initialize the header
writer.writeHeader(new VCFHeader(headerInfo, samples));
}
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return null;
List<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
VariantContext mergedVC = mergeVCsWithGLs(VCs);
if ( mergedVC == null )
return null;
return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC);
}
public Integer reduceInit() { return 0; }
/**
 * Writes each non-null call to the output VCF and counts it.
 *
 * @param value the call produced by map(), possibly null for skipped sites
 * @param sum   running count of emitted sites
 * @return sum, incremented by one when a record was written
 * @throws IllegalArgumentException if chromosome counts cannot be computed for
 *         the record (commonly a sample-name mismatch; see message)
 */
public Integer reduce(VariantCallContext value, Integer sum) {
    if ( value == null )
        return sum;

    try {
        VariantContextBuilder builder = new VariantContextBuilder(value);
        VariantContextUtils.calculateChromosomeCounts(builder, true);
        writer.add(builder.make());
    } catch (IllegalArgumentException e) {
        // re-wrap with a hint about the usual root cause, but preserve the
        // original exception as the cause so the real stack trace isn't lost
        throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name", e);
    }

    return sum + 1;
}
/** Logs the final count of visited (emitted) sites. */
public void onTraversalDone(Integer result) {
    logger.info("Visited sites: " + result);
}
/**
 * Merges a list of co-located VariantContexts into a single hybrid record,
 * keeping the first variant (ALT-carrying) record's site information and
 * pooling all genotypes that carry genotype likelihoods.
 *
 * @param VCs records at this locus; may be empty
 * @return the merged context, or null when VCs is empty
 * @throws UserException when no record in the list carries an ALT allele
 */
private static VariantContext mergeVCsWithGLs(List<VariantContext> VCs) {
    // we can't use the VCUtils classes because our VCs can all be no-calls
    if ( VCs.isEmpty() )
        return null;

    VariantContext variantVC = null;
    GenotypesContext genotypes = GenotypesContext.create();
    for ( VariantContext vc : VCs ) {
        // keep the first record that actually has an ALT allele as the site template
        if ( variantVC == null && vc.isVariant() )
            variantVC = vc;
        genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes()));
    }

    if ( variantVC == null ) {
        VariantContext vc = VCs.get(0);
        throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
    }

    return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make();
}
/** Returns only those genotypes that carry a usable likelihoods vector. */
private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) {
    final GenotypesContext withGLs = GenotypesContext.create(genotypes.size());
    for ( final Genotype g : genotypes ) {
        final boolean usable = g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null;
        if ( usable ) {
            withGLs.add(g);
        }
    }
    return withGLs;
}
}

View File

@ -126,10 +126,10 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
// dbSNP argument collection; its binding is exposed through getDbsnpRodBinding() below
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
// returns the dbSNP track from the argument collection above
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
// NOTE(review): the accessors below intentionally expose no extra tracks for this
// walker (null / empty list); presumably they satisfy an annotation-source
// interface declared elsewhere — confirm against the implemented interface.
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
// do not force-append dbSNP IDs to records that already have an ID
public boolean alwaysAppendDbsnpId() { return false; }
/**
* A raw, unfiltered, highly specific callset in VCF format.
@ -205,6 +205,12 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
*
**/
public void initialize() {
// warn the user for misusing EMIT_ALL_SITES
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES &&
UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY &&
UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
// get all of the unique sample names
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());

View File

@ -54,8 +54,9 @@ public class UnifiedGenotyperEngine {
EMIT_VARIANTS_ONLY,
/** produces calls at variant sites and confident reference sites */
EMIT_ALL_CONFIDENT_SITES,
/** produces calls at any callable site regardless of confidence; this argument is intended for point
* mutations (SNPs) only and while some indel calls may be produced they are by no means comprehensive */
/** produces calls at any callable site regardless of confidence; this argument is intended only for point
* mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by
* no means produce a comprehensive set of indels in DISCOVERY mode */
EMIT_ALL_SITES
}

View File

@ -0,0 +1,233 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
@Analysis(description = "Evaluation summary for multi-allelic variants")
public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval {
    final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class);

    public enum Type {
        SNP, INDEL
    }

    // basic counts on various rates found
    @DataPoint(description = "Number of processed loci")
    public long nProcessedLoci = 0;

    @DataPoint(description = "Number of SNPs")
    public int nSNPs = 0;
    @DataPoint(description = "Number of multi-allelic SNPs")
    public int nMultiSNPs = 0;
    @DataPoint(description = "% processed sites that are multi-allelic SNPs", format = "%.5f")
    public double processedMultiSnpRatio = 0;
    @DataPoint(description = "% SNP sites that are multi-allelic", format = "%.3f")
    public double variantMultiSnpRatio = 0;

    @DataPoint(description = "Number of Indels")
    public int nIndels = 0;
    @DataPoint(description = "Number of multi-allelic Indels")
    public int nMultiIndels = 0;
    @DataPoint(description = "% processed sites that are multi-allelic Indels", format = "%.5f")
    public double processedMultiIndelRatio = 0;
    @DataPoint(description = "% Indel sites that are multi-allelic", format = "%.3f")
    public double variantMultiIndelRatio = 0;

    @DataPoint(description = "Number of Transitions")
    public int nTi = 0;
    @DataPoint(description = "Number of Transversions")
    public int nTv = 0;
    @DataPoint(description = "Overall TiTv ratio", format = "%.2f")
    public double TiTvRatio = 0;

    @DataPoint(description = "Multi-allelic SNPs partially known")
    public int knownSNPsPartial = 0;
    @DataPoint(description = "Multi-allelic SNPs completely known")
    public int knownSNPsComplete = 0;
    @DataPoint(description = "Multi-allelic SNP Novelty Rate")
    public String SNPNoveltyRate = "NA";

    // NOTE(review): the two indel "known" counters below are never incremented
    // because calculateIndelPairwiseNovelty() is an empty stub (see below).
    @DataPoint(description = "Multi-allelic Indels partially known")
    public int knownIndelsPartial = 0;
    @DataPoint(description = "Multi-allelic Indels completely known")
    public int knownIndelsComplete = 0;
    @DataPoint(description = "Multi-allelic Indel Novelty Rate")
    public String indelNoveltyRate = "NA";

    @DataPoint(description="Histogram of allele frequencies")
    AFHistogram AFhistogram = new AFHistogram();

    /*
     * AF histogram table object: a single-row table with one column per
     * allele-frequency bin ("0.00" .. "1.00" in steps of 0.01).
     */
    static class AFHistogram implements TableType {
        private Object[] colKeys, rowKeys = {"pairwise_AF"};
        private int[] AFhistogram;

        private static final double AFincrement = 0.01;
        private static final int numBins = (int)(1.00 / AFincrement);

        public AFHistogram() {
            colKeys = initColKeys();
            AFhistogram = new int[colKeys.length];
        }

        public Object[] getColumnKeys() {
            return colKeys;
        }

        public Object[] getRowKeys() {
            return rowKeys;
        }

        // single-row table, so the row index is ignored
        public Object getCell(int row, int col) {
            return AFhistogram[col];
        }

        // Builds the bin labels "0.00" through "1.00".  Uses an integer loop
        // counter so the number of bins cannot drift with floating-point
        // accumulation error (the original double-stepped loop relied on a
        // slack bound of 1.01 to survive rounding).
        private static Object[] initColKeys() {
            ArrayList<String> keyList = new ArrayList<String>(numBins + 1);
            for ( int bin = 0; bin <= numBins; bin++ ) {
                keyList.add(String.format("%.2f", bin * AFincrement));
            }
            return keyList.toArray();
        }

        public String getName() { return "AFHistTable"; }

        /**
         * Folds each per-allele AF value from the record's AF attribute into the
         * histogram.  Records whose AF attribute is missing or not a List are
         * silently skipped.
         */
        @SuppressWarnings("unchecked")
        public void update(VariantContext vc) {
            final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null);
            if ( obj == null || !(obj instanceof List) )
                return;

            // assumes the list elements parse as doubles in [0, 1] — a value
            // above 1.0 would index past the last bin; TODO confirm upstream
            List<String> list = (List<String>)obj;
            for ( String str : list ) {
                final double AF = Double.valueOf(str);
                final int bin = (int)(numBins * MathUtils.round(AF, 2));
                AFhistogram[bin]++;
            }
        }
    }

    public void initialize(VariantEvalWalker walker) {}

    @Override public boolean enabled() { return true; }

    // update2() consumes an eval and a comp context together
    public int getComparisonOrder() {
        return 2;
    }

    public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
    }

    /**
     * Per-site update: tallies SNP/indel counts and, for multi-allelic sites,
     * Ti/Tv and novelty statistics, then updates the AF histogram.
     *
     * @return always null — this evaluator captures no interesting sites
     */
    public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( eval == null || eval.isMonomorphicInSamples() )
            return null;

        // update counts
        switch ( eval.getType() ) {
            case SNP:
                nSNPs++;
                if ( !eval.isBiallelic() ) {
                    nMultiSNPs++;
                    calculatePairwiseTiTv(eval);
                    calculateSNPPairwiseNovelty(eval, comp);
                }
                break;
            case INDEL:
                nIndels++;
                if ( !eval.isBiallelic() ) {
                    nMultiIndels++;
                    calculateIndelPairwiseNovelty(eval, comp);
                }
                break;
            default:
                throw new UserException.BadInput("Unexpected variant context type: " + eval);
        }

        AFhistogram.update(eval);

        return null; // we don't capture any interesting sites
    }

    // classify every ref->alt pair at this site as a transition or transversion
    private void calculatePairwiseTiTv(VariantContext vc) {
        for ( Allele alt : vc.getAlternateAlleles() ) {
            if ( VariantContextUtils.isTransition(vc.getReference(), alt) )
                nTi++;
            else
                nTv++;
        }
    }

    // count how many of the eval site's ALT alleles also appear in the comp record
    private void calculateSNPPairwiseNovelty(VariantContext eval, VariantContext comp) {
        if ( comp == null )
            return;

        int knownAlleles = 0;
        for ( Allele alt : eval.getAlternateAlleles() ) {
            if ( comp.getAlternateAlleles().contains(alt) )
                knownAlleles++;
        }

        if ( knownAlleles == eval.getAlternateAlleles().size() )
            knownSNPsComplete++;
        else if ( knownAlleles > 0 )
            knownSNPsPartial++;
    }

    // TODO: not yet implemented, so knownIndelsPartial/Complete stay at zero
    private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) {
    }

    // fraction of 'all' sites not covered by 'known', formatted to 2 dp; "NA" when all == 0
    private String noveltyRate(final int all, final int known) {
        if ( all == 0 )
            return "NA";
        final int novel = all - known;
        return String.format("%.2f", novel / (1.0 * all));
    }

    public void finalizeEvaluation() {
        // NOTE: ratios can be NaN/Infinity when a denominator is zero (doubles never throw here)
        processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci;
        variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs;
        processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci;
        variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels;

        TiTvRatio = (double)nTi / (double)nTv;

        SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete);
        // BUG FIX: was noveltyRate(nMultiSNPs, ...) — a copy-paste from the SNP
        // line above, which made the indel novelty rate use the SNP denominator
        indelNoveltyRate = noveltyRate(nMultiIndels, knownIndelsPartial + knownIndelsComplete);
    }
}

View File

@ -120,6 +120,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;
@Hidden
@Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false)
public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE;
/**
* Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
*/
@ -236,13 +240,24 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
return 0;
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic
for ( VariantContext.Type type : VariantContext.Type.values() ) {
if ( VCsByType.containsKey(type) )
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic
for (VariantContext.Type type : VariantContext.Type.values()) {
if (VCsByType.containsKey(type))
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
}
else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs,
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
else {
logger.warn("Ignoring all records at site " + ref.getLocus());
}
for ( VariantContext mergedVC : mergedVCs ) {

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
@ -149,7 +150,11 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
count = Integer.valueOf(numberStr);
}
type = VCFHeaderLineType.valueOf(mapping.get("Type"));
try {
type = VCFHeaderLineType.valueOf(mapping.get("Type"));
} catch (Exception e) {
throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)");
}
if (type == VCFHeaderLineType.Flag && !allowFlagValues())
throw new IllegalArgumentException("Flag is an unsupported type for this kind of field");

View File

@ -29,6 +29,7 @@ import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -471,6 +472,18 @@ public class VariantContextUtils {
KEEP_UNCONDITIONAL
}
/**
 * Policy for combining co-located records whose alternate alleles are of
 * different types (e.g. a SNP and an indel starting at the same position).
 * Hidden from the standard command-line help output.
 */
@Hidden
public enum MultipleAllelesMergeType {
/**
* Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record.
*/
BY_TYPE,
/**
* Merge all allele types at the same start position into the same VCF record.
*/
MIX_TYPES
}
/**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
@ -1060,6 +1073,14 @@ public class VariantContextUtils {
return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION;
}
/**
 * Tests whether the ref -> alt substitution is a transition, delegating to
 * {@link BaseUtils#SNPSubstitutionType} on the first base of each allele.
 * NOTE(review): only the first base is inspected, so this is meaningful only
 * for single-base (SNP) alleles — confirm callers never pass indel alleles.
 */
public static boolean isTransition(Allele ref, Allele alt) {
return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION;
}
/**
 * Tests whether the ref -> alt substitution is a transversion, delegating to
 * {@link BaseUtils#SNPSubstitutionType} on the first base of each allele.
 * NOTE(review): only the first base is inspected, so this is meaningful only
 * for single-base (SNP) alleles — confirm callers never pass indel alleles.
 */
public static boolean isTransversion(Allele ref, Allele alt) {
return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION;
}
/**
* create a genome location, given a variant context
* @param genomeLocParser parser

View File

@ -110,6 +110,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("getting DB tag with dbSNP", spec);
}
// Integration test: runs the annotator with --alwaysAppendDbsnpId against a VCF
// whose records already carry IDs; the single md5 below pins the expected output.
@Test
public void testMultipleIdsWithDbsnp() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3withIDs.vcf -L " + validationDataLocation + "vcfexample3withIDs.vcf", 1,
Arrays.asList("cd7e3d43b8f5579c461b3e588a295fa8"));
executeTest("adding multiple IDs with dbSNP", spec);
}
@Test
public void testDBTagWithHapMap() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("877de5b0cc61dc54636062df6399b978"));
Arrays.asList("1d1956fd7b0f0d30935674b2f5019860"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
}