Merge branch 'master' of ssh://chartl@ni.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
9d4b84f6bd
|
|
@ -92,7 +92,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
// Call the walker's isActive function for this locus and add the result to the list to be integrated later
|
||||
if( initialIntervals.overlaps(location) ) {
|
||||
final boolean isActive = walker.isActive( tracker, refContext, locus );
|
||||
final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) );
|
||||
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
}
|
||||
|
||||
|
|
@ -109,7 +109,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
if( !locusView.hasNext() ) {
|
||||
// Call the walker's isActive function for this locus and add the result to the list to be integrated later
|
||||
if( initialIntervals.overlaps(location) ) {
|
||||
final boolean isActive = walker.isActive( tracker, refContext, locus );
|
||||
final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) );
|
||||
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
}
|
||||
|
||||
|
|
@ -128,7 +128,16 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
// add these blocks of work to the work queue
|
||||
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList );
|
||||
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
workQueue.addAll( activeRegions );
|
||||
if( walker.activeRegionOutStream == null ) {
|
||||
workQueue.addAll( activeRegions );
|
||||
} else { // Just want to output the active regions to a file, not actually process them
|
||||
for( final ActiveRegion activeRegion : activeRegions ) {
|
||||
if( activeRegion.isActive ) {
|
||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Since we've moved sufficiently past this point (or this contig!) in the workQueue, we can unload those regions and process them
|
||||
if( !workQueue.isEmpty() ) {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
|
||||
|
|
@ -14,8 +19,10 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -32,6 +39,31 @@ import java.util.List;
|
|||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
|
||||
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||
|
||||
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
|
||||
public PrintStream activeRegionOutStream = null;
|
||||
|
||||
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
|
||||
protected List<IntervalBinding<Feature>> activeRegionBindings = null;
|
||||
|
||||
public GenomeLocSortedSet presetActiveRegions = null;
|
||||
|
||||
/**
 * Builds presetActiveRegions from the optional activeRegionIn interval bindings.
 *
 * If no bindings were supplied (activeRegionBindings is null) this returns immediately and
 * presetActiveRegions stays null. Otherwise the intervals of every binding are unioned into a
 * single list, which is then sorted and merged (IntervalMergingRule.ALL) into presetActiveRegions.
 */
@Override
|
||||
public void initialize() {
|
||||
// nothing to preload when the argument was not given
if( activeRegionBindings == null ) { return; }
|
||||
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
|
||||
// NOTE(review): the loop variable is the raw type IntervalBinding although the field is declared as List<IntervalBinding<Feature>>
for ( IntervalBinding intervalBinding : activeRegionBindings ) {
|
||||
List<GenomeLoc> intervals = intervalBinding.getIntervals(this.getToolkit());
|
||||
|
||||
// an empty binding only produces a warning; processing continues with the remaining bindings
if ( intervals.isEmpty() ) {
|
||||
logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
|
||||
}
|
||||
|
||||
// accumulate the union of everything seen so far with this binding's intervals
allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION);
|
||||
}
|
||||
|
||||
presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL);
|
||||
}
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
return true; // We are keeping all the reads
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -23,7 +24,7 @@ import java.util.*;
|
|||
* Time: 12:24 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||
|
||||
private MendelianViolation mendelianViolation = null;
|
||||
private String motherId;
|
||||
|
|
|
|||
|
|
@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -84,7 +83,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
// Returns the variants binding held by the variantCollection argument collection.
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
|
||||
|
||||
/**
|
||||
* The INFO field will be annotated with information on the most biologically-significant effect
|
||||
|
|
@ -163,6 +161,13 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
|
||||
protected Boolean LIST = false;
|
||||
|
||||
/**
|
||||
* By default, the dbSNP ID is added only when the ID field in the variant VCF is empty.
|
||||
*/
|
||||
@Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated")
|
||||
protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
|
||||
// Returns the value of the alwaysAppendDbsnpId argument; the Boolean field is auto-unboxed to a primitive here.
public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
|
||||
protected boolean indelsOnly = false;
|
||||
|
|
|
|||
|
|
@ -195,11 +195,20 @@ public class VariantAnnotatorEngine {
|
|||
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
||||
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
|
||||
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
|
||||
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
|
||||
final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
|
||||
|
||||
// put the DB key into the INFO field
|
||||
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
|
||||
// annotate dbsnp id if available and not already there
|
||||
if ( rsID != null && vc.emptyID() )
|
||||
vc = new VariantContextBuilder(vc).id(rsID).make();
|
||||
|
||||
// add the ID if appropriate
|
||||
if ( rsID != null ) {
|
||||
if ( vc.emptyID() ) {
|
||||
vc = new VariantContextBuilder(vc).id(rsID).make();
|
||||
} else if ( walker.alwaysAppendDbsnpId() && vc.getID().indexOf(rsID) == -1 ) {
|
||||
final String newRsID = vc.getID() + VCFConstants.ID_FIELD_SEPARATOR + rsID;
|
||||
vc = new VariantContextBuilder(vc).id(newRsID).make();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
boolean overlapsComp = false;
|
||||
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {
|
||||
|
|
|
|||
|
|
@ -8,9 +8,9 @@ import java.util.List;
|
|||
/**
 * Accessors through which a walker exposes its ROD bindings and its dbSNP-ID append setting.
 */
public interface AnnotatorCompatibleWalker {
|
||||
|
||||
// getter methods for various used bindings
|
||||
public abstract RodBinding<VariantContext> getVariantRodBinding();
|
||||
public abstract RodBinding<VariantContext> getSnpEffRodBinding();
|
||||
public abstract RodBinding<VariantContext> getDbsnpRodBinding();
|
||||
public abstract List<RodBinding<VariantContext>> getCompRodBindings();
|
||||
public abstract List<RodBinding<VariantContext>> getResourceRodBindings();
|
||||
// true when a dbSNP ID should be appended even if the variant already has an ID;
// read as walker.alwaysAppendDbsnpId() in VariantAnnotatorEngine.annotateDBs
public abstract boolean alwaysAppendDbsnpId();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
|
||||
// Constructor: forwards all four arguments unchanged to the superclass constructor and sets no state of its own.
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
|
@ -166,7 +165,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
final int numChr = 2*numSamples;
|
||||
|
||||
// queue of AC conformations to process
|
||||
final Queue<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
|
||||
|
|
@ -177,11 +176,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
||||
// optimization: create the temporary storage for computing L(j,k) just once
|
||||
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
|
||||
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
|
||||
for ( int i = 0; i < maxPossibleDependencies; i++ )
|
||||
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
|
||||
// optimization: create the temporary storage for computing L(j,k) just once
|
||||
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
|
||||
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
|
||||
for ( int i = 0; i < maxPossibleDependencies; i++ )
|
||||
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
|
|
@ -195,16 +194,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Simple holder pairing an allele-count array with the PL index that goes with it.
 * calculateAlleleCountConformation collects these for the k+2 conformations and later passes
 * ACcounts and PLindex to updateACset.
 */
private static final class DependentSet {
|
||||
// allele counts of the dependent conformation; the array is stored by reference, not copied
public final int[] ACcounts;
|
||||
// PL index associated with this conformation
public final int PLindex;
|
||||
|
||||
public DependentSet(final int[] ACcounts, final int PLindex) {
|
||||
this.ACcounts = ACcounts;
|
||||
this.PLindex = PLindex;
|
||||
}
|
||||
}
|
||||
|
||||
private static double calculateAlleleCountConformation(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final double maxLog10L,
|
||||
final int numChr,
|
||||
final boolean preserveData,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final double[][] tempLog10ConformationLikelihoods) {
|
||||
final double[][] tempLog10ConformationLikelihoods) {
|
||||
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||
|
|
@ -215,7 +224,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// clean up memory
|
||||
if ( !preserveData ) {
|
||||
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
|
||||
indexesToACset.put(index, null);
|
||||
indexesToACset.remove(index);
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
|
||||
}
|
||||
|
|
@ -230,7 +239,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
// no reason to keep this data around because nothing depends on it
|
||||
if ( !preserveData )
|
||||
indexesToACset.put(set.ACcounts, null);
|
||||
indexesToACset.remove(set.ACcounts);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
|
@ -240,7 +249,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
|
||||
final int numAltAlleles = set.ACcounts.getCounts().length;
|
||||
|
||||
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
|
||||
|
|
@ -251,30 +259,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
|
||||
updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
|
||||
}
|
||||
|
||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||
if ( ACwiggle > 1 ) {
|
||||
final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
|
||||
final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);
|
||||
|
||||
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
|
||||
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele_i]++;
|
||||
ACcountsClone[allele_j]++;
|
||||
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset);
|
||||
|
||||
if ( allele_i == allele_j )
|
||||
sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
|
||||
else
|
||||
differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
|
||||
}
|
||||
}
|
||||
|
||||
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
||||
for ( DependentSet dependent : differentAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
||||
for ( DependentSet dependent : sameAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
||||
}
|
||||
|
||||
// if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate
|
||||
// over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early)
|
||||
if ( !preserveData && lastSet == null ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts);
|
||||
lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
||||
// determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
|
||||
if ( !preserveData ) {
|
||||
final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
||||
if ( lastSet != null )
|
||||
lastSet.dependentACsetsToDelete.add(set.ACcounts);
|
||||
}
|
||||
if ( lastSet != null )
|
||||
lastSet.dependentACsetsToDelete.add(set.ACcounts);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
|
@ -282,34 +300,36 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||
// also adds it as a dependency to the given callingSetIndex.
|
||||
// returns the ExactACset if that set was not already in the queue and null otherwise.
|
||||
private static ExactACset updateACset(final int[] ACcounts,
|
||||
final int numChr,
|
||||
final ExactACset callingSet,
|
||||
final int PLsetIndex,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
private static void updateACset(final int[] ACcounts,
|
||||
final int numChr,
|
||||
final ExactACset callingSet,
|
||||
final int PLsetIndex,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
final ExactACcounts index = new ExactACcounts(ACcounts);
|
||||
boolean wasInQueue = true;
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
ExactACset set = new ExactACset(numChr/2 +1, index);
|
||||
indexesToACset.put(index, set);
|
||||
ACqueue.add(set);
|
||||
wasInQueue = false;
|
||||
}
|
||||
|
||||
// add the given dependency to the set
|
||||
//if ( DEBUG )
|
||||
// System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
|
||||
final ExactACset set = indexesToACset.get(index);
|
||||
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
|
||||
return wasInQueue ? null : set;
|
||||
}
|
||||
|
||||
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue<ExactACset> ACqueue) {
|
||||
ExactACset set = null;
|
||||
for ( ExactACset queued : ACqueue ) {
|
||||
if ( queued.dependentACsetsToDelete.contains(callingSetIndex) )
|
||||
set = queued;
|
||||
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
|
||||
Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
|
||||
while ( reverseIterator.hasNext() ) {
|
||||
final ExactACset queued = reverseIterator.next();
|
||||
if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
|
||||
return queued;
|
||||
}
|
||||
return set;
|
||||
|
||||
// shouldn't get here
|
||||
throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
|
||||
}
|
||||
|
||||
private static void computeLofK(final ExactACset set,
|
||||
|
|
@ -317,7 +337,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final double[][] tempLog10ConformationLikelihoods) {
|
||||
final double[][] tempLog10ConformationLikelihoods) {
|
||||
|
||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||
final int totalK = set.getACsum();
|
||||
|
|
@ -329,40 +349,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
// k > 0 for at least one k
|
||||
else {
|
||||
// deal with the non-AA possible conformations
|
||||
int conformationIndex = 1;
|
||||
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
||||
// deal with the non-AA possible conformations
|
||||
int conformationIndex = 1;
|
||||
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
||||
|
||||
ExactACset dependent = indexesToACset.get(mapping.getKey());
|
||||
ExactACset dependent = indexesToACset.get(mapping.getKey());
|
||||
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
tempLog10ConformationLikelihoods[j][conformationIndex] =
|
||||
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
|
||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
tempLog10ConformationLikelihoods[j][conformationIndex] =
|
||||
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
|
||||
} else {
|
||||
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
conformationIndex++;
|
||||
}
|
||||
conformationIndex++;
|
||||
}
|
||||
|
||||
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK < 2*j-1 ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||
} else {
|
||||
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
if ( totalK < 2*j-1 ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||
} else {
|
||||
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
|
||||
set.log10Likelihoods[j] = log10Max - logDenominator;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,114 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
* Uses the UG engine to determine per-sample genotype likelihoods and emits them as a VCF (using PLs).
|
||||
* Absolutely not supported or recommended for public use.
|
||||
* Run this as you would the UnifiedGenotyper, except that you must additionally pass in a VCF bound to
|
||||
* the name 'allele' so we know which alternate allele to use at each site.
|
||||
*/
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
@By(DataSource.READS)
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||
public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer> implements TreeReducible<Integer> {
|
||||
|
||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
// the calculation arguments
|
||||
// created in initialize(); null until then
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
// i.e. true for every GLmodel setting other than Model.SNP
public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; }
|
||||
|
||||
// Creates the genotyper engine for the samples found in the SAM header and writes the VCF header.
public void initialize() {
|
||||
// get all of the unique sample names
|
||||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
|
||||
|
||||
// initialize the header
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples)) ;
|
||||
}
|
||||
|
||||
// Asks the engine for the likelihoods at this locus; returns null when the engine returns no call.
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext);
|
||||
// NOTE(review): the meaning of the 'true' constructor argument is not visible here — confirm against VariantCallContext
return call == null ? null : new VariantCallContext(call, true);
|
||||
}
|
||||
|
||||
// the running count starts at zero
public Integer reduceInit() { return 0; }
|
||||
|
||||
// combines two partial counts by adding them
public Integer treeReduce(Integer lhs, Integer rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
||||
// Writes a non-null call to the VCF writer and increments the count; a null value leaves the count unchanged.
public Integer reduce(VariantCallContext value, Integer sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
writer.add(value);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// NOTE(review): only e.getMessage() is carried over; the original exception is not passed as the cause, so its stack trace is lost
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum + 1;
|
||||
}
|
||||
|
||||
// Logs the final count. Note that reduce() only counts non-null calls that were written, although the message says "Visited bases".
public void onTraversalDone(Integer sum) {
|
||||
logger.info(String.format("Visited bases: %d", sum));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,152 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Uses the UG engine to call variants based off of VCFs annotated with GLs (or PLs).
|
||||
* Absolutely not supported or recommended for public use.
|
||||
* Run this as you would the UnifiedGenotyper, except that instead of '-I reads' it expects any number
|
||||
* of GL/PL-annotated VCFs bound to a name starting with 'variant'.
|
||||
*/
|
||||
public class UGCallVariants extends RodWalker<VariantCallContext, Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
|
||||
public List<RodBinding<VariantContext>> variants;
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// variant track names
|
||||
private Set<String> trackNames = new HashSet<String>();
|
||||
|
||||
public void initialize() {
|
||||
|
||||
for ( RodBinding<VariantContext> rb : variants )
|
||||
trackNames.add(rb.getName());
|
||||
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames);
|
||||
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )
|
||||
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
|
||||
|
||||
// initialize the header
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples));
|
||||
}
|
||||
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
List<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
|
||||
|
||||
VariantContext mergedVC = mergeVCsWithGLs(VCs);
|
||||
if ( mergedVC == null )
|
||||
return null;
|
||||
|
||||
return UG_engine.calculateGenotypes(tracker, ref, context, mergedVC);
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(VariantCallContext value, Integer sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
VariantContextBuilder builder = new VariantContextBuilder(value);
|
||||
VariantContextUtils.calculateChromosomeCounts(builder, true);
|
||||
writer.add(builder.make());
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum + 1;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
logger.info(String.format("Visited sites: %d", result));
|
||||
}
|
||||
|
||||
private static VariantContext mergeVCsWithGLs(List<VariantContext> VCs) {
|
||||
// we can't use the VCUtils classes because our VCs can all be no-calls
|
||||
if ( VCs.size() == 0 )
|
||||
return null;
|
||||
|
||||
VariantContext variantVC = null;
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if ( variantVC == null && vc.isVariant() )
|
||||
variantVC = vc;
|
||||
genotypes.addAll(getGenotypesWithGLs(vc.getGenotypes()));
|
||||
}
|
||||
|
||||
if ( variantVC == null ) {
|
||||
VariantContext vc = VCs.get(0);
|
||||
throw new UserException("There is no ALT allele in any of the VCF records passed in at " + vc.getChr() + ":" + vc.getStart());
|
||||
}
|
||||
|
||||
return new VariantContextBuilder(variantVC).source("VCwithGLs").genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
private static GenotypesContext getGenotypesWithGLs(GenotypesContext genotypes) {
|
||||
GenotypesContext genotypesWithGLs = GenotypesContext.create(genotypes.size());
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( g.hasLikelihoods() && g.getLikelihoods().getAsVector() != null )
|
||||
genotypesWithGLs.add(g);
|
||||
}
|
||||
return genotypesWithGLs;
|
||||
}
|
||||
}
|
||||
|
|
@ -126,10 +126,10 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
/**
 * @return the binding stored in the {@code dbsnp} argument collection
 */
public RodBinding<VariantContext> getDbsnpRodBinding() {
    return dbsnp.dbsnp;
}
|
||||
/**
 * @return always null
 */
public RodBinding<VariantContext> getVariantRodBinding() {
    return null;
}
|
||||
/**
 * @return always null
 */
public RodBinding<VariantContext> getSnpEffRodBinding() {
    return null;
}
|
||||
/**
 * @return always an empty list
 */
public List<RodBinding<VariantContext>> getCompRodBindings() {
    return Collections.emptyList();
}
|
||||
/**
 * @return always an empty list
 */
public List<RodBinding<VariantContext>> getResourceRodBindings() {
    return Collections.emptyList();
}
|
||||
/**
 * @return always false
 */
public boolean alwaysAppendDbsnpId() {
    return false;
}
|
||||
|
||||
/**
|
||||
* A raw, unfiltered, highly specific callset in VCF format.
|
||||
|
|
@ -205,6 +205,12 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
// warn the user for misusing EMIT_ALL_SITES
|
||||
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES &&
|
||||
UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY &&
|
||||
UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
|
||||
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
|
||||
|
||||
// get all of the unique sample names
|
||||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
|
|
|
|||
|
|
@ -54,8 +54,9 @@ public class UnifiedGenotyperEngine {
|
|||
EMIT_VARIANTS_ONLY,
|
||||
/** produces calls at variant sites and confident reference sites */
|
||||
EMIT_ALL_CONFIDENT_SITES,
|
||||
/** produces calls at any callable site regardless of confidence; this argument is intended for point
|
||||
* mutations (SNPs) only and while some indel calls may be produced they are by no means comprehensive */
|
||||
/** produces calls at any callable site regardless of confidence; this argument is intended only for point
|
||||
* mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by
|
||||
* no means produce a comprehensive set of indels in DISCOVERY mode */
|
||||
EMIT_ALL_SITES
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@Analysis(description = "Evaluation summary for multi-allelic variants")
|
||||
public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval {
|
||||
final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class);
|
||||
|
||||
public enum Type {
|
||||
SNP, INDEL
|
||||
}
|
||||
|
||||
// basic counts on various rates found
|
||||
@DataPoint(description = "Number of processed loci")
|
||||
public long nProcessedLoci = 0;
|
||||
|
||||
@DataPoint(description = "Number of SNPs")
|
||||
public int nSNPs = 0;
|
||||
@DataPoint(description = "Number of multi-allelic SNPs")
|
||||
public int nMultiSNPs = 0;
|
||||
@DataPoint(description = "% processed sites that are multi-allelic SNPs", format = "%.5f")
|
||||
public double processedMultiSnpRatio = 0;
|
||||
@DataPoint(description = "% SNP sites that are multi-allelic", format = "%.3f")
|
||||
public double variantMultiSnpRatio = 0;
|
||||
|
||||
@DataPoint(description = "Number of Indels")
|
||||
public int nIndels = 0;
|
||||
@DataPoint(description = "Number of multi-allelic Indels")
|
||||
public int nMultiIndels = 0;
|
||||
@DataPoint(description = "% processed sites that are multi-allelic Indels", format = "%.5f")
|
||||
public double processedMultiIndelRatio = 0;
|
||||
@DataPoint(description = "% Indel sites that are multi-allelic", format = "%.3f")
|
||||
public double variantMultiIndelRatio = 0;
|
||||
|
||||
@DataPoint(description = "Number of Transitions")
|
||||
public int nTi = 0;
|
||||
@DataPoint(description = "Number of Transversions")
|
||||
public int nTv = 0;
|
||||
@DataPoint(description = "Overall TiTv ratio", format = "%.2f")
|
||||
public double TiTvRatio = 0;
|
||||
|
||||
@DataPoint(description = "Multi-allelic SNPs partially known")
|
||||
public int knownSNPsPartial = 0;
|
||||
@DataPoint(description = "Multi-allelic SNPs completely known")
|
||||
public int knownSNPsComplete = 0;
|
||||
@DataPoint(description = "Multi-allelic SNP Novelty Rate")
|
||||
public String SNPNoveltyRate = "NA";
|
||||
|
||||
@DataPoint(description = "Multi-allelic Indels partially known")
|
||||
public int knownIndelsPartial = 0;
|
||||
@DataPoint(description = "Multi-allelic Indels completely known")
|
||||
public int knownIndelsComplete = 0;
|
||||
@DataPoint(description = "Multi-allelic Indel Novelty Rate")
|
||||
public String indelNoveltyRate = "NA";
|
||||
|
||||
@DataPoint(description="Histogram of allele frequencies")
|
||||
AFHistogram AFhistogram = new AFHistogram();
|
||||
|
||||
/*
|
||||
* AF histogram table object
|
||||
*/
|
||||
static class AFHistogram implements TableType {
|
||||
private Object[] colKeys, rowKeys = {"pairwise_AF"};
|
||||
private int[] AFhistogram;
|
||||
|
||||
private static final double AFincrement = 0.01;
|
||||
private static final int numBins = (int)(1.00 / AFincrement);
|
||||
|
||||
public AFHistogram() {
|
||||
colKeys = initColKeys();
|
||||
AFhistogram = new int[colKeys.length];
|
||||
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
return colKeys;
|
||||
}
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
return rowKeys;
|
||||
}
|
||||
|
||||
public Object getCell(int row, int col) {
|
||||
return AFhistogram[col];
|
||||
}
|
||||
|
||||
private static Object[] initColKeys() {
|
||||
ArrayList<String> keyList = new ArrayList<String>(numBins + 1);
|
||||
for ( double a = 0.00; a <= 1.01; a += AFincrement ) {
|
||||
keyList.add(String.format("%.2f", a));
|
||||
}
|
||||
return keyList.toArray();
|
||||
}
|
||||
|
||||
public String getName() { return "AFHistTable"; }
|
||||
|
||||
public void update(VariantContext vc) {
|
||||
final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null);
|
||||
if ( obj == null || !(obj instanceof List) )
|
||||
return;
|
||||
|
||||
List<String> list = (List<String>)obj;
|
||||
for ( String str : list ) {
|
||||
final double AF = Double.valueOf(str);
|
||||
final int bin = (int)(numBins * MathUtils.round(AF, 2));
|
||||
AFhistogram[bin]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize(VariantEvalWalker walker) {}
|
||||
|
||||
@Override public boolean enabled() { return true; }
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 2;
|
||||
}
|
||||
|
||||
public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
|
||||
}
|
||||
|
||||
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( eval == null || eval.isMonomorphicInSamples() )
|
||||
return null;
|
||||
|
||||
// update counts
|
||||
switch ( eval.getType() ) {
|
||||
case SNP:
|
||||
nSNPs++;
|
||||
if ( !eval.isBiallelic() ) {
|
||||
nMultiSNPs++;
|
||||
calculatePairwiseTiTv(eval);
|
||||
calculateSNPPairwiseNovelty(eval, comp);
|
||||
}
|
||||
break;
|
||||
case INDEL:
|
||||
nIndels++;
|
||||
if ( !eval.isBiallelic() ) {
|
||||
nMultiIndels++;
|
||||
calculateIndelPairwiseNovelty(eval, comp);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new UserException.BadInput("Unexpected variant context type: " + eval);
|
||||
}
|
||||
AFhistogram.update(eval);
|
||||
|
||||
return null; // we don't capture any interesting sites
|
||||
}
|
||||
|
||||
private void calculatePairwiseTiTv(VariantContext vc) {
|
||||
for ( Allele alt : vc.getAlternateAlleles() ) {
|
||||
if ( VariantContextUtils.isTransition(vc.getReference(), alt) )
|
||||
nTi++;
|
||||
else
|
||||
nTv++;
|
||||
}
|
||||
}
|
||||
|
||||
private void calculateSNPPairwiseNovelty(VariantContext eval, VariantContext comp) {
|
||||
if ( comp == null )
|
||||
return;
|
||||
|
||||
int knownAlleles = 0;
|
||||
for ( Allele alt : eval.getAlternateAlleles() ) {
|
||||
if ( comp.getAlternateAlleles().contains(alt) )
|
||||
knownAlleles++;
|
||||
}
|
||||
|
||||
if ( knownAlleles == eval.getAlternateAlleles().size() )
|
||||
knownSNPsComplete++;
|
||||
else if ( knownAlleles > 0 )
|
||||
knownSNPsPartial++;
|
||||
}
|
||||
|
||||
private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) {
|
||||
}
|
||||
|
||||
private final String noveltyRate(final int all, final int known) {
|
||||
final int novel = all - known;
|
||||
final double rate = (novel / (1.0 * all));
|
||||
return all == 0 ? "NA" : String.format("%.2f", rate);
|
||||
}
|
||||
|
||||
public void finalizeEvaluation() {
|
||||
processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci;
|
||||
variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs;
|
||||
processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci;
|
||||
variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels;
|
||||
|
||||
TiTvRatio = (double)nTi / (double)nTv;
|
||||
|
||||
SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete);
|
||||
indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete);
|
||||
}
|
||||
}
|
||||
|
|
@ -120,6 +120,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
@Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
|
||||
public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false)
|
||||
public VariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = VariantContextUtils.MultipleAllelesMergeType.BY_TYPE;
|
||||
|
||||
/**
|
||||
* Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
|
||||
*/
|
||||
|
|
@ -236,13 +240,24 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
return 0;
|
||||
|
||||
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
|
||||
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
|
||||
// iterate over the types so that it's deterministic
|
||||
for ( VariantContext.Type type : VariantContext.Type.values() ) {
|
||||
if ( VCsByType.containsKey(type) )
|
||||
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
|
||||
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
|
||||
|
||||
if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
|
||||
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
|
||||
// iterate over the types so that it's deterministic
|
||||
for (VariantContext.Type type : VariantContext.Type.values()) {
|
||||
if (VCsByType.containsKey(type))
|
||||
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
|
||||
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
|
||||
}
|
||||
}
|
||||
else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
|
||||
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs,
|
||||
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
|
||||
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
|
||||
}
|
||||
else {
|
||||
logger.warn("Ignoring all records at site " + ref.getLocus());
|
||||
}
|
||||
|
||||
for ( VariantContext mergedVC : mergedVCs ) {
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -149,7 +150,11 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
|||
count = Integer.valueOf(numberStr);
|
||||
|
||||
}
|
||||
type = VCFHeaderLineType.valueOf(mapping.get("Type"));
|
||||
try {
|
||||
type = VCFHeaderLineType.valueOf(mapping.get("Type"));
|
||||
} catch (Exception e) {
|
||||
throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)");
|
||||
}
|
||||
if (type == VCFHeaderLineType.Flag && !allowFlagValues())
|
||||
throw new IllegalArgumentException("Flag is an unsupported type for this kind of field");
|
||||
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import org.apache.commons.jexl2.Expression;
|
|||
import org.apache.commons.jexl2.JexlEngine;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -471,6 +472,18 @@ public class VariantContextUtils {
|
|||
KEEP_UNCONDITIONAL
|
||||
}
|
||||
|
||||
@Hidden
|
||||
public enum MultipleAllelesMergeType {
|
||||
/**
|
||||
* Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record.
|
||||
*/
|
||||
BY_TYPE,
|
||||
/**
|
||||
* Merge all allele types at the same start position into the same VCF record.
|
||||
*/
|
||||
MIX_TYPES
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
|
||||
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
||||
|
|
@ -1060,6 +1073,14 @@ public class VariantContextUtils {
|
|||
return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION;
|
||||
}
|
||||
|
||||
public static boolean isTransition(Allele ref, Allele alt) {
|
||||
return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION;
|
||||
}
|
||||
|
||||
public static boolean isTransversion(Allele ref, Allele alt) {
|
||||
return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a genome location, given a variant context
|
||||
* @param genomeLocParser parser
|
||||
|
|
|
|||
|
|
@ -110,6 +110,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
executeTest("getting DB tag with dbSNP", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleIdsWithDbsnp() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3withIDs.vcf -L " + validationDataLocation + "vcfexample3withIDs.vcf", 1,
|
||||
Arrays.asList("cd7e3d43b8f5579c461b3e588a295fa8"));
|
||||
executeTest("adding multiple IDs with dbSNP", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDBTagWithHapMap() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
||||
Arrays.asList("877de5b0cc61dc54636062df6399b978"));
|
||||
Arrays.asList("1d1956fd7b0f0d30935674b2f5019860"));
|
||||
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue