Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2011-09-13 17:04:43 -04:00
commit 5b1bf6e244
27 changed files with 221 additions and 308 deletions

View File

@ -379,7 +379,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
if ( tribbleType == null )
if ( ! file.canRead() | !! file.isFile() ) {
if ( ! file.canRead() | ! file.isFile() ) {
throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
} else {
throw new UserException.CommandLineException(

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -84,7 +85,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs())
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan());
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
currentLocus = null;
continue;
}

View File

@ -215,6 +215,45 @@ public class GATKBAMIndex {
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
}
/**
* Use to get close to the unmapped reads at the end of a BAM file.
* @return The file offset of the first record in the last linear bin, or -1
* if there are no elements in linear bins (i.e. no mapped reads).
*/
public long getStartOfLastLinearBin() {
openIndexFile();
seek(4);
final int sequenceCount = readInteger();
// Because no reads may align to the last sequence in the sequence dictionary,
// grab the last element of the linear index for each sequence, and return
// the last one from the last sequence that has one.
long lastLinearIndexPointer = -1;
for (int i = 0; i < sequenceCount; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
// System.out.println("# nBins: " + nBins);
for (int j1 = 0; j1 < nBins; j1++) {
// Skip bin #
skipBytes(4);
final int nChunks = readInteger();
// Skip chunks
skipBytes(16 * nChunks);
}
final int nLinearBins = readInteger();
if (nLinearBins > 0) {
// Skip to last element of list of linear bins
skipBytes(8 * (nLinearBins - 1));
lastLinearIndexPointer = readLongs(1)[0];
}
}
closeIndexFile();
return lastLinearIndexPointer;
}
/**
* Gets the possible number of bins for a given reference sequence.
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.

View File

@ -134,24 +134,11 @@ public class ReadShardStrategy implements ShardStrategy {
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
while(selectedReaders.size() == 0 && currentFilePointer != null) {
shardPosition = currentFilePointer.fileSpans;
for(SAMReaderID id: shardPosition.keySet()) {
// If the region contains location information (in other words, it is not at
// the start of the unmapped region), add the region.
if(currentFilePointer.isRegionUnmapped) {
// If the region is unmapped and no location data exists, add a null as an indicator to
// start at the next unmapped region.
if(!isIntoUnmappedRegion) {
selectedReaders.put(id,null);
isIntoUnmappedRegion = true;
}
else
selectedReaders.put(id,position.get(id));
}
else {
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
if(!fileSpan.isEmpty())
selectedReaders.put(id,fileSpan);
}
if(selectedReaders.size() > 0) {

View File

@ -86,6 +86,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
public RodBinding<VariantContext> getVariantRodBinding() { return variantCollection.variants; }
/**
* The INFO field will be annotated with information on the most biologically-significant effect
@ -208,6 +209,8 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this);
engine.initializeExpressions(expressionsToUse);
engine.invokeAnnotationInitializationMethods();
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();

View File

@ -29,10 +29,7 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -113,6 +110,16 @@ public class VariantAnnotatorEngine {
dbAnnotations.put(rod, rod.getName());
}
public void invokeAnnotationInitializationMethods() {
for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
annotation.initialize(walker);
}
for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.initialize(walker);
}
}
public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
Set<VCFHeaderLine> descriptions = new HashSet<VCFHeaderLine>();

View File

@ -9,6 +9,7 @@ import java.util.List;
public interface AnnotatorCompatibleWalker {
// getter methods for various used bindings
public abstract RodBinding<VariantContext> getVariantRodBinding();
public abstract RodBinding<SnpEffFeature> getSnpEffRodBinding();
public abstract RodBinding<VariantContext> getDbsnpRodBinding();
public abstract List<RodBinding<VariantContext>> getCompRodBindings();

View File

@ -24,18 +24,15 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
import java.util.Map;
@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations")
public abstract class VariantAnnotatorAnnotation {
// return the INFO keys
public abstract List<String> getKeyNames();
// initialization method (optional for subclasses, and therefore non-abstract)
public void initialize ( AnnotatorCompatibleWalker walker ) { }
}

View File

@ -127,6 +127,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
public RodBinding<VariantContext> getVariantRodBinding() { return null; }
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }

View File

@ -55,7 +55,7 @@ import java.util.*;
*
* <h2>Output</h2>
* <p>
* Evaluation tables.
* Evaluation tables detailing the results of the eval modules which were applied.
* </p>
*
* <h2>Examples</h2>
@ -152,6 +152,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
private File ancestralAlignmentsFile = null;
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
private boolean requireStrictAlleleMatch = false;
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
@ -360,16 +363,16 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
if ( matchingComps.size() == 0 )
return null;
// find the comp which matches the alternate allele from eval
// find the comp which matches both the reference allele and alternate allele from eval
Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
for ( VariantContext comp : matchingComps ) {
Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp)) )
if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) )
return comp;
}
// if none match, just return the first one
return matchingComps.get(0);
// if none match, just return the first one unless we require a strict match
return (requireStrictAlleleMatch ? null : matchingComps.get(0));
}
public Integer treeReduce(Integer lhs, Integer rhs) { return null; }

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/12/11
*/
public class TrainingSet {
public RodBinding<VariantContext> rodBinding;
public boolean isKnown = false;
public boolean isTraining = false;
public boolean isAntiTraining = false;
public boolean isTruth = false;
public boolean isConsensus = false;
public double prior = 0.0;
protected final static Logger logger = Logger.getLogger(TrainingSet.class);
public TrainingSet( final RodBinding<VariantContext> rodBinding) {
this.rodBinding = rodBinding;
final Tags tags = rodBinding.getTags();
final String name = rodBinding.getName();
// Parse the tags to decide which tracks have which properties
if( tags != null ) {
isKnown = tags.containsKey("known") && tags.getValue("known").equals("true");
isTraining = tags.containsKey("training") && tags.getValue("training").equals("true");
isAntiTraining = tags.containsKey("bad") && tags.getValue("bad").equals("true");
isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true");
isConsensus = tags.containsKey("consensus") && tags.getValue("consensus").equals("true");
prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior );
}
// Report back to the user which tracks were found and the properties that were detected
if( !isConsensus && !isAntiTraining ) {
logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) );
} else if( isConsensus ) {
logger.info( String.format( "Found consensus track: %s", name) );
} else {
logger.info( String.format( "Found bad sites training track: %s", name) );
}
}
}

View File

@ -51,10 +51,10 @@ public class VariantDataManager {
private ExpandingArrayList<VariantDatum> data;
private final double[] meanVector;
private final double[] varianceVector; // this is really the standard deviation
public final ArrayList<String> annotationKeys;
public final List<String> annotationKeys;
private final VariantRecalibratorArgumentCollection VRAC;
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
protected final List<TrainingSet> trainingSets;
public VariantDataManager( final List<String> annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
this.data = null;
@ -62,6 +62,7 @@ public class VariantDataManager {
this.VRAC = VRAC;
meanVector = new double[this.annotationKeys.size()];
varianceVector = new double[this.annotationKeys.size()];
trainingSets = new ArrayList<TrainingSet>();
}
public void setData( final ExpandingArrayList<VariantDatum> data ) {
@ -104,6 +105,31 @@ public class VariantDataManager {
}
}
public void addTrainingSet( final TrainingSet trainingSet ) {
trainingSets.add( trainingSet );
}
public boolean checkHasTrainingSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isTraining ) { return true; }
}
return false;
}
public boolean checkHasTruthSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isTruth ) { return true; }
}
return false;
}
public boolean checkHasKnownSet() {
for( final TrainingSet trainingSet : trainingSets ) {
if( trainingSet.isKnown ) { return true; }
}
return false;
}
public ExpandingArrayList<VariantDatum> getTrainingData() {
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
for( final VariantDatum datum : data ) {
@ -232,57 +258,35 @@ public class VariantDataManager {
return value;
}
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap<String, Double> rodToPriorMap,
final List<RodBinding<VariantContext>> training, final List<RodBinding<VariantContext>> truth, final List<RodBinding<VariantContext>> known, final List<RodBinding<VariantContext>> badSites, final List<RodBinding<VariantContext>> resource) {
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) {
datum.isKnown = false;
datum.atTruthSite = false;
datum.atTrainingSite = false;
datum.atAntiTrainingSite = false;
datum.prior = 2.0;
//BUGBUG: need to clean this up
for( final RodBinding<VariantContext> rod : training ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
for( final TrainingSet trainingSet : trainingSets ) {
for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.atTrainingSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
datum.isKnown = datum.isKnown || trainingSet.isKnown;
datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining;
datum.prior = Math.max( datum.prior, trainingSet.prior );
datum.consensusCount += ( trainingSet.isConsensus ? 1 : 0 );
}
}
}
for( final RodBinding<VariantContext> rod : truth ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.atTruthSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : known ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.isKnown = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : resource ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : badSites ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( trainVC != null ) {
datum.atAntiTrainingSite = true;
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
datum.atAntiTrainingSite = datum.atAntiTrainingSite || trainingSet.isAntiTraining;
}
}
}
}
private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
}
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
for( final VariantDatum datum : data ) {
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
@ -290,10 +294,4 @@ public class VariantDataManager {
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
}
}
private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) {
return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() &&
((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) &&
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic());
}
}

View File

@ -77,16 +77,15 @@ import java.util.*;
* <p>
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
*
* <h2>Examples</h2>
* <h2>Example</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T VariantRecalibrator \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -known:prior=8.0 dbsnp_132.b37.vcf \
* -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
* -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
* -recalFile path/to/output.recal \
* -tranchesFile path/to/output.tranches \
@ -112,34 +111,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public List<RodBinding<VariantContext>> input;
/**
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
*/
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
public List<RodBinding<VariantContext>> training;
/**
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
*/
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
public List<RodBinding<VariantContext>> truth;
/**
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* The output metrics are stratified by known status in order to aid in comparisons with other call sets.
*/
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList();
/**
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
*/
@Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
public List<RodBinding<VariantContext>> badSites = Collections.emptyList();
/**
* Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
* Any set of VCF files to use as lists of training, truth, or known sites.
* Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
* Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
*/
@Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
public List<RodBinding<VariantContext>> resource = Collections.emptyList();
@ -205,7 +181,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private PrintStream tranchesStream;
private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
private final HashMap<String, Double> rodToPriorMap = new HashMap<String, Double>();
//---------------------------------------------------------------------------------------------------------------
//
@ -227,18 +202,15 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
}
final ArrayList<RodBinding<VariantContext>> allInputBindings = new ArrayList<RodBinding<VariantContext>>();
allInputBindings.addAll(truth);
allInputBindings.addAll(training);
allInputBindings.addAll(known);
allInputBindings.addAll(badSites);
allInputBindings.addAll(resource);
for( final RodBinding<VariantContext> rod : allInputBindings ) {
try {
rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
} catch( NumberFormatException e ) {
throw new UserException.BadInput("Bad rod binding syntax. Prior key-value tag detected but isn't parsable. Expecting something like -training:prior=12.0 my.set.vcf");
}
for( RodBinding<VariantContext> rod : resource ) {
dataManager.addTrainingSet( new TrainingSet( rod ) );
}
if( !dataManager.checkHasTrainingSet() ) {
throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
if( !dataManager.checkHasTruthSet() ) {
throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
}
@ -270,7 +242,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
datum.isTransition = datum.isSNP && VariantContextUtils.isTransition(vc);
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC, rodToPriorMap, training, truth, known, badSites, resource ); // BUGBUG: need to clean this up to be a class, not a list of all the rod bindings
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
double priorFactor = QualityUtils.qualToProb( datum.prior );
//if( PERFORM_PROJECT_CONSENSUS ) { // BUGBUG: need to resurrect this functionality?
// final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );

View File

@ -41,11 +41,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -known:prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -training:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -truth:prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -training:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -truth:prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -T VariantRecalibrator" +
" -input " + params.inVCF +
" -L 20:1,000,000-40,000,000" +
@ -89,9 +87,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -known:prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -training:prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" +
" -truth:prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" +
" -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" +
" -T VariantRecalibrator" +
" -input " + params.inVCF +
" -L 20:1,000,000-40,000,000" +

View File

@ -6,10 +6,7 @@
<dependencies>
<!-- Recalibration analysis script -->
<class name="org.broadinstitute.sting.analyzecovariates.AnalyzeCovariates" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.DinucCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.QualityScoreCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.CycleCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.ReadGroupCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.*" />
</dependencies>
</executable>
<resources>

View File

@ -1,10 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="FindContaminatingReadGroups">
<executable name="FindContaminatingReadGroups">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<class name="org.broadinstitute.sting.playground.gatk.walkers.contamination.FindContaminatingReadGroupsWalker" />
</dependencies>
</executable>
</package>

View File

@ -1,20 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="GATKResources">
<resources>
<file name="/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" />
<file name="/humgen/gsa-scr1/GATK_Data/dbsnp_129_b37.rod" />
<file name="/humgen/gsa-scr1/GATK_Data/dbsnp_130_b36.rod" />
<file name="/humgen/1kg/reference/human_b36_both.fasta" />
<file name="/humgen/1kg/reference/human_b36_both.dict" />
<file name="/humgen/1kg/reference/human_b36_both.fasta.fai" />
<file name="/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod" />
<file name="/humgen/gsa-scr1/GATK_Data/dbsnp_130_hg18.rod" />
<file name="/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" />
<file name="/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict" />
<file name="/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai" />
<file name="/humgen/gsa-scr1/GATK_Data/dbsnp_130_b37.rod" />
<file name="/humgen/1kg/reference/human_g1k_v37.fasta" />
<file name="/humgen/1kg/reference/human_g1k_v37.dict" />
<file name="/humgen/1kg/reference/human_g1k_v37.fasta.fai" />
</resources>
</package>

View File

@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="IndelGenotyper">
<executable name="IndelGenotyper">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<!-- Local realignment around indels -->
<class name="org.broadinstitute.sting.gatk.walkers.indels.IndelGenotyperV2Walker" />
</dependencies>
</executable>
</package>

View File

@ -1,12 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="LocalRealignmentAroundIndels">
<executable name="LocalRealignmentAroundIndels">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<!-- Local realignment around indels -->
<class name="org.broadinstitute.sting.gatk.walkers.indels.RealignerTargetCreator" />
<class name="org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner" />
</dependencies>
</executable>
</package>

View File

@ -1,18 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="QualityScoresRecalibration">
<executable name="QualityScoresRecalibration">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<!-- Quality scores recalibration -->
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.CovariateCounterWalker" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.TableRecalibrationWalker" />
<!-- Recalibration Covariates -->
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.DinucCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.QualityScoreCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.CycleCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.ReadGroupCovariate" />
<class name="org.broadinstitute.sting.gatk.walkers.recalibration.TileCovariate" />
</dependencies>
</executable>
</package>

View File

@ -1,13 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="RMDIndexer">
<executable name="RMDIndexer">
<main-class name="org.broadinstitute.sting.gatk.refdata.indexer.RMDIndexer" />
<resource-bundle file="StingText.properties" />
<dependencies>
<package name="org.broad.tribble.*" />
<package name="org.broadinstitute.sting.gatk.refdata.features.*" />
<!-- the class itself -->
<class name="org.broadinstitute.sting.gatk.refdata.indexer.RMDIndexer" />
</dependencies>
</executable>
</package>

View File

@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="UnifiedGenotyper">
<executable name="UnifiedGenotyper">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<!-- Unified genotyper -->
<class name="org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper" />
</dependencies>
</executable>
</package>

View File

@ -1,26 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="VariantAnnotator">
<executable name="VariantAnnotator">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<!-- Variant annotator -->
<class name="org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator" />
<!-- The interfacess -->
<class name="org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation" />
<!-- The annotations -->
<class name="org.broadinstitute.sting.gatk.walkers.annotator.DepthPerAlleleBySample" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.DepthOfCoverage" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.HomopolymerRun" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.MappingQualityZero" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.QualByDepth" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.RMSMappingQuality" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.SpanningDeletions" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.HaplotypeScore" />
<class name="org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCounts" />
</dependencies>
</executable>
</package>

View File

@ -1,18 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="VariantEval">
<executable name="VariantEval">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.CountVariants" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.CompOverlap" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.GenotypeConcordance" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.MendelianViolationEvaluator" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.TiTvVariantEvaluator" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.ValidationRate" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.VariantQualityScore" />
<class name="org.broadinstitute.sting.gatk.walkers.varianteval.CountFunctionalClasses" />
</dependencies>
</executable>
</package>

View File

@ -1,13 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="VariantFiltration">
<executable name="VariantFiltration">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<!-- Workaround - depend on the logger impl required by JEXL -->
<package name="org.apache.commons.logging.impl" />
<!-- Variant filtration -->
<class name="org.broadinstitute.sting.gatk.walkers.filters.VariantFiltrationWalker" />
</dependencies>
</executable>
</package>

View File

@ -1,12 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<package name="VariantRecalibration">
<executable name="VariantRecalibration">
<main-class name="org.broadinstitute.sting.gatk.CommandLineGATK" />
<resource-bundle file="StingText.properties" />
<dependencies>
<!-- Variant recalibration -->
<class name="org.broadinstitute.sting.gatk.walkers.variantrecalibration.GenerateVariantClustersWalker" />
<class name="org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator" />
</dependencies>
</executable>
</package>

View File

@ -248,12 +248,10 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } )
this.training :+= new TaggedFile( t.hapmapFile, "prior=15.0")
this.truth :+= new TaggedFile( t.hapmapFile, "prior=15.0")
this.training :+= new TaggedFile( omni_b37, "prior=12.0")
this.truth :+= new TaggedFile( omni_b37, "prior=12.0")
this.training :+= new TaggedFile( training_1000G, "prior=10.0" )
this.known :+= new TaggedFile( t.dbsnpFile, "prior=2.0" )
this.resource :+= new TaggedFile( t.hapmapFile, "training=true,truth=true,prior=15.0" )
this.resource :+= new TaggedFile( omni_b37, "training=true,truth=true,prior=12.0" )
this.resource :+= new TaggedFile( training_1000G, "training=true,prior=10.0" )
this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" )
this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" )
this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS")
if(t.nSamples >= 10) {