Merge branch 'master' of ssh://chartl@ni.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Christopher Hartl 2012-01-26 12:38:24 -05:00
commit 9d4b84f6bd
17 changed files with 450 additions and 351 deletions

View File

@ -92,7 +92,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// Call the walkers isActive function for this locus and add them to the list to be integrated later // Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) { if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus ); final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) ); isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
} }
@ -109,7 +109,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
if( !locusView.hasNext() ) { if( !locusView.hasNext() ) {
// Call the walkers isActive function for this locus and add them to the list to be integrated later // Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) { if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus ); final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) ); isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
} }
@ -128,7 +128,16 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// add these blocks of work to the work queue // add these blocks of work to the work queue
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList ); final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList );
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
workQueue.addAll( activeRegions ); if( walker.activeRegionOutStream == null ) {
workQueue.addAll( activeRegions );
} else { // Just want to output the active regions to a file, not actually process them
for( final ActiveRegion activeRegion : activeRegions ) {
if( activeRegion.isActive ) {
walker.activeRegionOutStream.println( activeRegion.getLocation() );
}
}
}
// Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them // Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
if( !workQueue.isEmpty() ) { if( !workQueue.isEmpty() ) {

View File

@ -1,6 +1,11 @@
package org.broadinstitute.sting.gatk.walkers; package org.broadinstitute.sting.gatk.walkers;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
@ -14,8 +19,10 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.IntervalUtils;
import java.io.PrintStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -32,6 +39,31 @@ import java.util.List;
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> { public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
public PrintStream activeRegionOutStream = null;
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
protected List<IntervalBinding<Feature>> activeRegionBindings = null;
public GenomeLocSortedSet presetActiveRegions = null;
@Override
public void initialize() {
if( activeRegionBindings == null ) { return; }
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
for ( IntervalBinding intervalBinding : activeRegionBindings ) {
List<GenomeLoc> intervals = intervalBinding.getIntervals(this.getToolkit());
if ( intervals.isEmpty() ) {
logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
}
allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION);
}
presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL);
}
// Do we actually want to operate on the context? // Do we actually want to operate on the context?
public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
return true; // We are keeping all the reads return true; // We are keeping all the reads

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
@ -23,7 +24,7 @@ import java.util.*;
* Time: 12:24 PM * Time: 12:24 PM
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation { public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
private MendelianViolation mendelianViolation = null; private MendelianViolation mendelianViolation = null;
private String motherId; private String motherId;

View File

@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
@ -84,7 +83,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@ArgumentCollection @ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
/** /**
* The INFO field will be annotated with information on the most biologically-significant effect * The INFO field will be annotated with information on the most biologically-significant effect
@ -163,6 +161,13 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
protected Boolean LIST = false; protected Boolean LIST = false;
/**
* By default, the dbSNP ID is added only when the ID field in the variant VCF is empty.
*/
@Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated")
protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
@Hidden @Hidden
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false; protected boolean indelsOnly = false;

View File

@ -195,11 +195,20 @@ public class VariantAnnotatorEngine {
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) { private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) { for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
// put the DB key into the INFO field
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null); infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
// annotate dbsnp id if available and not already there
if ( rsID != null && vc.emptyID() ) // add the ID if appropriate
vc = new VariantContextBuilder(vc).id(rsID).make(); if ( rsID != null ) {
if ( vc.emptyID() ) {
vc = new VariantContextBuilder(vc).id(rsID).make();
} else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) {
final String newRsID = vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID;
vc = new VariantContextBuilder(vc).id(newRsID).make();
}
}
} else { } else {
boolean overlapsComp = false; boolean overlapsComp = false;
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {

View File

@ -8,9 +8,9 @@ import java.util.List;
public interface AnnotatorCompatibleWalker { public interface AnnotatorCompatibleWalker {
// getter methods for various used bindings // getter methods for various used bindings
public abstract RodBinding<VariantContext> getVariantRodBinding();
public abstract RodBinding<VariantContext> getSnpEffRodBinding(); public abstract RodBinding<VariantContext> getSnpEffRodBinding();
public abstract RodBinding<VariantContext> getDbsnpRodBinding(); public abstract RodBinding<VariantContext> getDbsnpRodBinding();
public abstract List<RodBinding<VariantContext>> getCompRodBindings(); public abstract List<RodBinding<VariantContext>> getCompRodBindings();
public abstract List<RodBinding<VariantContext>> getResourceRodBindings(); public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
public abstract boolean alwaysAppendDbsnpId();
} }

View File

@ -39,7 +39,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter); super(UAC, N, logger, verboseWriter);
} }
@ -166,7 +165,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final int numChr = 2*numSamples; final int numChr = 2*numSamples;
// queue of AC conformations to process // queue of AC conformations to process
final Queue<ExactACset> ACqueue = new LinkedList<ExactACset>(); final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects // mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1); final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
@ -177,11 +176,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
ACqueue.add(zeroSet); ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet); indexesToACset.put(zeroSet.ACcounts, zeroSet);
// optimization: create the temporary storage for computing L(j,k) just once // optimization: create the temporary storage for computing L(j,k) just once
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1; final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies]; final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
for ( int i = 0; i < maxPossibleDependencies; i++ ) for ( int i = 0; i < maxPossibleDependencies; i++ )
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY; tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
// keep processing while we have AC conformations that need to be calculated // keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY; double maxLog10L = Double.NEGATIVE_INFINITY;
@ -195,16 +194,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
} }
private static final class DependentSet {
public final int[] ACcounts;
public final int PLindex;
public DependentSet(final int[] ACcounts, final int PLindex) {
this.ACcounts = ACcounts;
this.PLindex = PLindex;
}
}
private static double calculateAlleleCountConformation(final ExactACset set, private static double calculateAlleleCountConformation(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods, final ArrayList<double[]> genotypeLikelihoods,
final double maxLog10L, final double maxLog10L,
final int numChr, final int numChr,
final boolean preserveData, final boolean preserveData,
final Queue<ExactACset> ACqueue, final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset, final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result, final AlleleFrequencyCalculationResult result,
final double[][] tempLog10ConformationLikelihoods) { final double[][] tempLog10ConformationLikelihoods) {
//if ( DEBUG ) //if ( DEBUG )
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
@ -215,7 +224,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// clean up memory // clean up memory
if ( !preserveData ) { if ( !preserveData ) {
for ( ExactACcounts index : set.dependentACsetsToDelete ) { for ( ExactACcounts index : set.dependentACsetsToDelete ) {
indexesToACset.put(index, null); indexesToACset.remove(index);
//if ( DEBUG ) //if ( DEBUG )
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); // System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
} }
@ -230,7 +239,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// no reason to keep this data around because nothing depends on it // no reason to keep this data around because nothing depends on it
if ( !preserveData ) if ( !preserveData )
indexesToACset.put(set.ACcounts, null); indexesToACset.remove(set.ACcounts);
return log10LofK; return log10LofK;
} }
@ -240,7 +249,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK; return log10LofK;
ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
final int numAltAlleles = set.ACcounts.getCounts().length; final int numAltAlleles = set.ACcounts.getCounts().length;
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
@ -251,30 +259,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
for ( int allele = 0; allele < numAltAlleles; allele++ ) { for ( int allele = 0; allele < numAltAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone(); final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele]++; ACcountsClone[allele]++;
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
} }
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
if ( ACwiggle > 1 ) { if ( ACwiggle > 1 ) {
final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone(); final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele_i]++; ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++; ACcountsClone[allele_j]++;
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset);
if ( allele_i == allele_j )
sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
else
differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
} }
} }
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
for ( DependentSet dependent : differentAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
for ( DependentSet dependent : sameAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
} }
// if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate // determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
// over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early) if ( !preserveData ) {
if ( !preserveData && lastSet == null ) { final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
//if ( DEBUG ) if ( lastSet != null )
// System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts); lastSet.dependentACsetsToDelete.add(set.ACcounts);
lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
} }
if ( lastSet != null )
lastSet.dependentACsetsToDelete.add(set.ACcounts);
return log10LofK; return log10LofK;
} }
@ -282,34 +300,36 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also adds it as a dependency to the given callingSetIndex. // also adds it as a dependency to the given callingSetIndex.
// returns the ExactACset if that set was not already in the queue and null otherwise. // returns the ExactACset if that set was not already in the queue and null otherwise.
private static ExactACset updateACset(final int[] ACcounts, private static void updateACset(final int[] ACcounts,
final int numChr, final int numChr,
final ExactACset callingSet, final ExactACset callingSet,
final int PLsetIndex, final int PLsetIndex,
final Queue<ExactACset> ACqueue, final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) { final HashMap<ExactACcounts, ExactACset> indexesToACset) {
final ExactACcounts index = new ExactACcounts(ACcounts); final ExactACcounts index = new ExactACcounts(ACcounts);
boolean wasInQueue = true;
if ( !indexesToACset.containsKey(index) ) { if ( !indexesToACset.containsKey(index) ) {
ExactACset set = new ExactACset(numChr/2 +1, index); ExactACset set = new ExactACset(numChr/2 +1, index);
indexesToACset.put(index, set); indexesToACset.put(index, set);
ACqueue.add(set); ACqueue.add(set);
wasInQueue = false;
} }
// add the given dependency to the set // add the given dependency to the set
//if ( DEBUG )
// System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
final ExactACset set = indexesToACset.get(index); final ExactACset set = indexesToACset.get(index);
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
return wasInQueue ? null : set;
} }
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue<ExactACset> ACqueue) { private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
ExactACset set = null; Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
for ( ExactACset queued : ACqueue ) { while ( reverseIterator.hasNext() ) {
if ( queued.dependentACsetsToDelete.contains(callingSetIndex) ) final ExactACset queued = reverseIterator.next();
set = queued; if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
return queued;
} }
return set;
// shouldn't get here
throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
} }
private static void computeLofK(final ExactACset set, private static void computeLofK(final ExactACset set,
@ -317,7 +337,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final HashMap<ExactACcounts, ExactACset> indexesToACset, final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result, final AlleleFrequencyCalculationResult result,
final double[][] tempLog10ConformationLikelihoods) { final double[][] tempLog10ConformationLikelihoods) {
set.log10Likelihoods[0] = 0.0; // the zero case set.log10Likelihoods[0] = 0.0; // the zero case
final int totalK = set.getACsum(); final int totalK = set.getACsum();
@ -329,40 +349,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
// k > 0 for at least one k // k > 0 for at least one k
else { else {
// deal with the non-AA possible conformations // deal with the non-AA possible conformations
int conformationIndex = 1; int conformationIndex = 1;
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) { for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
//if ( DEBUG ) //if ( DEBUG )
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); // System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
ExactACset dependent = indexesToACset.get(mapping.getKey()); ExactACset dependent = indexesToACset.get(mapping.getKey());
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK <= 2*j ) { // skip impossible conformations if ( totalK <= 2*j ) { // skip impossible conformations
final double[] gl = genotypeLikelihoods.get(j); final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][conformationIndex] = tempLog10ConformationLikelihoods[j][conformationIndex] =
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()]; determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
} else { } else {
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY; tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
} }
} }
conformationIndex++; conformationIndex++;
} }
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value // finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
final int numPaths = set.ACsetIndexToPLIndex.size() + 1; final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK < 2*j-1 ) { if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j); final double[] gl = genotypeLikelihoods.get(j);
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
} else { } else {
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY; tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
} }
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths); final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
set.log10Likelihoods[j] = log10Max - logDenominator; set.log10Likelihoods[j] = log10Max - logDenominator;
} }

View File

@ -1,114 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashSet;
import java.util.Set;
/**
* Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs).
* Absolutely not supported or recommended for public use.
* Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to
* the name 'allele' so we know which alternate allele to use at each site.
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.READS)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer> implements TreeReducible<Integer> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// enable deletions in the pileup
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; }
public void initialize() {
// get all of the unique sample names
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
// initialize the header
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
writer.writeHeader(new VCFHeader(headerInfo, samples)) ;
}
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext);
return call == null ? null : new VariantCallContext(call, true);
}
public Integer reduceInit() { return 0; }
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}
public Integer reduce(VariantCallContext value, Integer sum) {
if ( value == null )
return sum;
try {
writer.add(value);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum + 1;
}
public void onTraversalDone(Integer sum) {
logger.info(String.format("Visited bases: %d", sum));
}
}

View File

@ -1,152 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
* Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs).
* Absolutely not supported or recommended for public use.
* Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number
* of GL/PL-annotated VCFs bound to a name starting with 'variant'.
*/
public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
public List<RodBinding<VariantContext>> variants;
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// variant track names
private Set<String> trackNames = new HashSet<String>();
public void initialize() {
for ( RodBinding<VariantContext> rb : variants )
trackNames.add(rb.getName());
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
// initialize the header
writer.writeHeader(new VCFHeader(headerInfo, samples));
}
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return null;
List<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
VariantContext mergedVC = mergeVCsWithGLs(VCs);
if ( mergedVC == null )
return null;
return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC);
}
public Integer reduceInit() { return 0; }
public Integer reduce(VariantCallContext value, Integer sum) {
if ( value == null )
return sum;
try {
VariantContextBuilder builder = new VariantContextBuilder(value);
VariantContextUtils.calculateChromosomeCounts(builder, true);
writer.add(builder.make());
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum + 1;
}
public void onTraversalDone(Integer result) {
logger.info(String.format("Visited sites: %d", result));
}
private static VariantContext mergeVCsWithGLs(List<VariantContext> VCs) {
// we can't use the VCUtils classes because our VCs can all be no-calls
if ( VCs.size() == 0 )
return null;
VariantContext variantVC = null;
GenotypesContext genotypes = GenotypesContext.create();
for ( VariantContext vc : VCs ) {
if ( variantVC == null && vc.isVariant() )
variantVC = vc;
genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes()));
}
if ( variantVC == null ) {
VariantContext vc = VCs.get(0);
throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
}
return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make();
}
private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) {
GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size());
for ( final Genotype g : genotypes ) {
if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null )
genotypesWithGLs.add(g);
}
return genotypesWithGLs;
}
}

View File

@ -126,10 +126,10 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@ArgumentCollection @ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; } public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; } public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); } public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); } public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
public boolean alwaysAppendDbsnpId() { return false; }
/** /**
* A raw, unfiltered, highly specific callset in VCF format. * A raw, unfiltered, highly specific callset in VCF format.
@ -205,6 +205,12 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
* *
**/ **/
public void initialize() { public void initialize() {
// warn the user for misusing EMIT_ALL_SITES
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES &&
UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY &&
UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
// get all of the unique sample names // get all of the unique sample names
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());

View File

@ -54,8 +54,9 @@ public class UnifiedGenotyperEngine {
EMIT_VARIANTS_ONLY, EMIT_VARIANTS_ONLY,
/** produces calls at variant sites and confident reference sites */ /** produces calls at variant sites and confident reference sites */
EMIT_ALL_CONFIDENT_SITES, EMIT_ALL_CONFIDENT_SITES,
/** produces calls at any callable site regardless of confidence; this argument is intended for point /** produces calls at any callable site regardless of confidence; this argument is intended only for point
* mutations (SNPs) only and while some indel calls may be produced they are by no means comprehensive */ * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by
* no means produce a comprehensive set of indels in DISCOVERY mode */
EMIT_ALL_SITES EMIT_ALL_SITES
} }

View File

@ -0,0 +1,233 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
@Analysis(description = "Evaluation summary for multi-allelic variants")
public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval {
final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class);
public enum Type {
SNP, INDEL
}
// basic counts on various rates found
@DataPoint(description = "Number of processed loci")
public long nProcessedLoci = 0;
@DataPoint(description = "Number of SNPs")
public int nSNPs = 0;
@DataPoint(description = "Number of multi-allelic SNPs")
public int nMultiSNPs = 0;
@DataPoint(description = "% processed sites that are multi-allelic SNPs", format = "%.5f")
public double processedMultiSnpRatio = 0;
@DataPoint(description = "% SNP sites that are multi-allelic", format = "%.3f")
public double variantMultiSnpRatio = 0;
@DataPoint(description = "Number of Indels")
public int nIndels = 0;
@DataPoint(description = "Number of multi-allelic Indels")
public int nMultiIndels = 0;
@DataPoint(description = "% processed sites that are multi-allelic Indels", format = "%.5f")
public double processedMultiIndelRatio = 0;
@DataPoint(description = "% Indel sites that are multi-allelic", format = "%.3f")
public double variantMultiIndelRatio = 0;
@DataPoint(description = "Number of Transitions")
public int nTi = 0;
@DataPoint(description = "Number of Transversions")
public int nTv = 0;
@DataPoint(description = "Overall TiTv ratio", format = "%.2f")
public double TiTvRatio = 0;
@DataPoint(description = "Multi-allelic SNPs partially known")
public int knownSNPsPartial = 0;
@DataPoint(description = "Multi-allelic SNPs completely known")
public int knownSNPsComplete = 0;
@DataPoint(description = "Multi-allelic SNP Novelty Rate")
public String SNPNoveltyRate = "NA";
@DataPoint(description = "Multi-allelic Indels partially known")
public int knownIndelsPartial = 0;
@DataPoint(description = "Multi-allelic Indels completely known")
public int knownIndelsComplete = 0;
@DataPoint(description = "Multi-allelic Indel Novelty Rate")
public String indelNoveltyRate = "NA";
@DataPoint(description="Histogram of allele frequencies")
AFHistogram AFhistogram = new AFHistogram();
/*
* AF histogram table object
*/
static class AFHistogram implements TableType {
private Object[] colKeys, rowKeys = {"pairwise_AF"};
private int[] AFhistogram;
private static final double AFincrement = 0.01;
private static final int numBins = (int)(1.00 / AFincrement);
public AFHistogram() {
colKeys = initColKeys();
AFhistogram = new int[colKeys.length];
}
public Object[] getColumnKeys() {
return colKeys;
}
public Object[] getRowKeys() {
return rowKeys;
}
public Object getCell(int row, int col) {
return AFhistogram[col];
}
private static Object[] initColKeys() {
ArrayList<String> keyList = new ArrayList<String>(numBins + 1);
for ( double a = 0.00; a <= 1.01; a += AFincrement ) {
keyList.add(String.format("%.2f", a));
}
return keyList.toArray();
}
public String getName() { return "AFHistTable"; }
public void update(VariantContext vc) {
final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null);
if ( obj == null || !(obj instanceof List) )
return;
List<String> list = (List<String>)obj;
for ( String str : list ) {
final double AF = Double.valueOf(str);
final int bin = (int)(numBins * MathUtils.round(AF, 2));
AFhistogram[bin]++;
}
}
}
public void initialize(VariantEvalWalker walker) {}
@Override public boolean enabled() { return true; }
public int getComparisonOrder() {
return 2;
}
public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
}
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( eval == null || eval.isMonomorphicInSamples() )
return null;
// update counts
switch ( eval.getType() ) {
case SNP:
nSNPs++;
if ( !eval.isBiallelic() ) {
nMultiSNPs++;
calculatePairwiseTiTv(eval);
calculateSNPPairwiseNovelty(eval, comp);
}
break;
case INDEL:
nIndels++;
if ( !eval.isBiallelic() ) {
nMultiIndels++;
calculateIndelPairwiseNovelty(eval, comp);
}
break;
default:
throw new UserException.BadInput("Unexpected variant context type: " + eval);
}
AFhistogram.update(eval);
return null; // we don't capture any interesting sites
}
private void calculatePairwiseTiTv(VariantContext vc) {
for ( Allele alt : vc.getAlternateAlleles() ) {
if ( VariantContextUtils.isTransition(vc.getReference(), alt) )
nTi++;
else
nTv++;
}
}
private void calculateSNPPairwiseNovelty(VariantContext eval, VariantContext comp) {
if ( comp == null )
return;
int knownAlleles = 0;
for ( Allele alt : eval.getAlternateAlleles() ) {
if ( comp.getAlternateAlleles().contains(alt) )
knownAlleles++;
}
if ( knownAlleles == eval.getAlternateAlleles().size() )
knownSNPsComplete++;
else if ( knownAlleles > 0 )
knownSNPsPartial++;
}
private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) {
}
private final String noveltyRate(final int all, final int known) {
final int novel = all - known;
final double rate = (novel / (1.0 * all));
return all == 0 ? "NA" : String.format("%.2f", rate);
}
public void finalizeEvaluation() {
processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci;
variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs;
processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci;
variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels;
TiTvRatio = (double)nTi / (double)nTv;
SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete);
indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete);
}
}

View File

@ -120,6 +120,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;
@Hidden
@Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false)
public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE;
/** /**
* Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
*/ */
@ -236,13 +240,24 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
return 0; return 0;
List<VariantContext> mergedVCs = new ArrayList<VariantContext>(); List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
for ( VariantContext.Type type : VariantContext.Type.values() ) { Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
if ( VCsByType.containsKey(type) ) // iterate over the types so that it's deterministic
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), for (VariantContext.Type type : VariantContext.Type.values()) {
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, if (VCsByType.containsKey(type))
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
}
else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs,
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
else {
logger.warn("Ignoring all records at site " + ref.getLocus());
} }
for ( VariantContext mergedVC : mergedVCs ) { for ( VariantContext mergedVC : mergedVCs ) {

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.codecs.vcf; package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays; import java.util.Arrays;
@ -149,7 +150,11 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
count = Integer.valueOf(numberStr); count = Integer.valueOf(numberStr);
} }
type = VCFHeaderLineType.valueOf(mapping.get("Type")); try {
type = VCFHeaderLineType.valueOf(mapping.get("Type"));
} catch (Exception e) {
throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)");
}
if (type == VCFHeaderLineType.Flag && !allowFlagValues()) if (type == VCFHeaderLineType.Flag && !allowFlagValues())
throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); throw new IllegalArgumentException("Flag is an unsupported type for this kind of field");

View File

@ -29,6 +29,7 @@ import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine; import org.apache.commons.jexl2.JexlEngine;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
@ -471,6 +472,18 @@ public class VariantContextUtils {
KEEP_UNCONDITIONAL KEEP_UNCONDITIONAL
} }
@Hidden
public enum MultipleAllelesMergeType {
/**
* Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record.
*/
BY_TYPE,
/**
* Merge all allele types at the same start position into the same VCF record.
*/
MIX_TYPES
}
/** /**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
@ -1060,6 +1073,14 @@ public class VariantContextUtils {
return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION;
} }
public static boolean isTransition(Allele ref, Allele alt) {
return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION;
}
public static boolean isTransversion(Allele ref, Allele alt) {
return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION;
}
/** /**
* create a genome location, given a variant context * create a genome location, given a variant context
* @param genomeLocParser parser * @param genomeLocParser parser

View File

@ -110,6 +110,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("getting DB tag with dbSNP", spec); executeTest("getting DB tag with dbSNP", spec);
} }
@Test
public void testMultipleIdsWithDbsnp() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3withIDs.vcf -L " + validationDataLocation + "vcfexample3withIDs.vcf", 1,
Arrays.asList("cd7e3d43b8f5579c461b3e588a295fa8"));
executeTest("adding multiple IDs with dbSNP", spec);
}
@Test @Test
public void testDBTagWithHapMap() { public void testDBTagWithHapMap() {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("877de5b0cc61dc54636062df6399b978")); Arrays.asList("1d1956fd7b0f0d30935674b2f5019860"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4); executeTest("test MultiSample Phase1 indels with complicated records", spec4);
} }