Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2012-01-23 09:23:03 -05:00
commit 31d2f04368
23 changed files with 318 additions and 116 deletions

View File

@ -281,6 +281,10 @@
<equals arg1="${gatk.target}" arg2="private" casesensitive="false" />
</condition>
<condition property="include.external">
<available file="${external.dir}"/>
</condition>
<condition property="include.contracts">
<equals arg1="${use.contracts}" arg2="true" />
</condition>
@ -331,7 +335,7 @@
</javac>
</target>
<target name="gatk.compile.external.source" depends="gatk.compile.public.source,gatk.compile.private.source">
<target name="gatk.compile.external.source" depends="gatk.compile.public.source,gatk.compile.private.source" if="include.external">
<subant target="compile" genericantfile="build.xml">
<property name="build.dir" value="${external.build.dir}" />
<property name="dist.dir" value="${external.dist.dir}" />
@ -761,6 +765,7 @@
<property name="java.test.classes" value="${build.dir}/java/testclasses"/>
<property name="java.public.test.classes" value="${java.test.classes}/public"/>
<property name="java.private.test.classes" value="${java.test.classes}/private"/>
<property name="java.external.test.classes" value="${java.test.classes}/external"/>
<property name="java.public.test.sources" value="${public.dir}/java/test"/>
<property name="java.private.test.sources" value="${private.dir}/java/test"/>
<property name="scala.test.classes" value="${build.dir}/scala/testclasses"/>
@ -811,7 +816,23 @@
</javac>
</target>
<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile"/>
<target name="test.java.external.compile" depends="dist,test.init.compile,test.java.public.compile" if="include.external">
<mkdir dir="${java.external.test.classes}"/>
<echo message="Sting: Compiling external test cases!"/>
<javac fork="true" memoryMaximumSize="512m" destdir="${java.external.test.classes}" debug="true" optimize="on" tempdir="${java.io.tmpdir}" srcdir="${external.dir}">
<include name="*/test/**/*.java"/>
<classpath>
<path refid="external.dependencies" />
<pathelement location="${java.public.test.classes}"/>
<pathelement location="${java.classes}"/>
<pathelement location="${java.contracts}"/>
<pathelement location="${testng.jar}"/>
</classpath>
<compilerarg value="-proc:none"/>
</javac>
</target>
<target name="test.java.compile" depends="test.java.public.compile, test.java.private.compile, test.java.external.compile"/>
<target name="test.scala.public.compile" depends="test.java.compile,scala.compile" if="scala.include">
<mkdir dir="${scala.public.test.classes}"/>
@ -852,6 +873,7 @@
<pathelement location="${java.contracts}" />
<pathelement location="${java.public.test.classes}" />
<pathelement location="${java.private.test.classes}" />
<pathelement location="${java.external.test.classes}" />
<pathelement location="${scala.public.test.classes}" />
<pathelement location="${scala.private.test.classes}" />
<pathelement location="${R.tar.dir}" />
@ -934,6 +956,9 @@
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}.class" if="include.private"/>
</classfileset>
<classfileset dir="${java.external.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}.class" if="include.external"/>
</classfileset>
<classfileset dir="${scala.public.test.classes}" erroronmissingdir="false">
<include name="**/@{testtype}*.class" if="scala.include"/>
</classfileset>

View File

@ -443,14 +443,22 @@ public class GenomeAnalysisEngine {
if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
if(walker instanceof LocusWalker || walker instanceof ActiveRegionWalker) {
if(walker instanceof LocusWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
}
}
else if(walker instanceof ActiveRegionWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
}
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
// Apply special validation to read pair walkers.
if(walker instanceof ReadPairWalker) {

View File

@ -77,7 +77,7 @@ public class LinearMicroScheduler extends MicroScheduler {
done = walker.isDone();
}
// Special function call to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
// Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine
if( traversalEngine instanceof TraverseActiveRegions ) {
final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit());
accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator

View File

@ -7,10 +7,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -46,13 +45,15 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
T sum) {
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
LocusView locusView = getLocusView( walker, dataProvider );
final LocusView locusView = getLocusView( walker, dataProvider );
final GenomeLocSortedSet initialIntervals = engine.getIntervals(); // BUGBUG: unfortunate inefficiency that needs to be removed
int minStart = Integer.MAX_VALUE;
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE;
final ArrayList<ActiveRegion> isActiveList = new ArrayList<ActiveRegion>();
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
@ -90,9 +91,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
final boolean isActive = walker.isActive( tracker, refContext, locus );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser()) );
if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
}
// Grab all the previously unseen reads from this pileup and add them to the massive read list
for( final PileupElement p : locus.getBasePileup() ) {
final SAMRecord read = p.getRead();
@ -101,11 +104,20 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
}
// If this is the last pileup for this shard then need to calculate the minimum alignment start so that
// we know which active regions in the work queue are now safe to process
// If this is the last pileup for this shard then need to first do a special walker.isActive() call
// and then calculate the minimum alignment start so that we know which active regions in the work queue are now safe to process
if( !locusView.hasNext() ) {
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
}
for( final PileupElement p : locus.getBasePileup() ) {
final SAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
}
}
@ -117,11 +129,14 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList );
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
workQueue.addAll( activeRegions );
}
while( workQueue.peek().getLocation().getStop() < minStart ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
// Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
if( !workQueue.isEmpty() ) {
while( workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig()) ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
}
}
}
return sum;
@ -151,23 +166,27 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
bestRegion = otherRegionToTest;
}
}
bestRegion.add( (GATKSAMRecord) read, true );
bestRegion.add( (GATKSAMRecord) read );
// The read is also added to all other regions in which it overlaps but marked as non-primary
if( !bestRegion.equals(activeRegion) ) {
activeRegion.add( (GATKSAMRecord) read, false );
}
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read, false );
if( walker.wantsNonPrimaryReads() ) {
if( !bestRegion.equals(activeRegion) ) {
activeRegion.add( (GATKSAMRecord) read );
}
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read );
}
}
}
placedReads.add( read );
} else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) && walker.wantsNonPrimaryReads() ) {
activeRegion.add( (GATKSAMRecord) read );
}
}
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLocation());
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker
return walker.reduce( x, sum );
}
@ -178,8 +197,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
* @param dataProvider Data which which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
private LocusView getLocusView( final Walker<M,T> walker, final LocusShardDataProvider dataProvider ) {
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
@ -190,24 +209,32 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
// integrate active regions into contiguous chunks based on active status
// integrate active regions into contiguous chunks with identical active status
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<ActiveRegion> activeList ) {
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
ActiveRegion prevLocus = activeList.remove(0);
ActiveRegion startLocus = prevLocus;
for( final ActiveRegion thisLocus : activeList ) {
if( prevLocus.isActive != thisLocus.isActive ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser() ) );
startLocus = thisLocus;
if( activeList.size() == 0 ) {
return returnList;
} else if( activeList.size() == 1 ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()),
activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) );
return returnList;
} else {
ActiveRegion prevLocus = activeList.get(0);
ActiveRegion startLocus = prevLocus;
for( final ActiveRegion thisLocus : activeList ) {
if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) );
startLocus = thisLocus;
}
prevLocus = thisLocus;
}
prevLocus = thisLocus;
// output the last region if necessary
if( startLocus != prevLocus ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) );
}
return returnList;
}
// output the last region if necessary
if( startLocus != prevLocus ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser() ) );
}
return returnList;
}
}

View File

@ -0,0 +1,19 @@
package org.broadinstitute.sting.gatk.walkers;
import java.lang.annotation.Documented;
import java.lang.annotation.Inherited;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
/**
* Describes the size of the buffer region that is added to each active region when pulling in covered reads.
* User: rpoplin
* Date: 1/18/12
*/
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
public @interface ActiveRegionExtension {
public int extension() default 0;
}

View File

@ -1,13 +1,26 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* Base class for all the Active Region Walkers.
* User: rpoplin
* Date: 12/7/11
*/
@ -15,15 +28,33 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
@By(DataSource.READS)
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.READ)
@ActiveRegionExtension(extension=50)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
return true; // We are keeping all the reads
}
public boolean wantsNonPrimaryReads() {
return false;
}
// Determine active status over the AlignmentContext
public abstract boolean isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
// Map over the ActiveRegion
public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker);
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
for( final GenomeLoc interval : intervals.toList() ) {
final int start = Math.max( 1, interval.getStart() - activeRegionExtension );
final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension );
allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) );
}
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL);
}
}

View File

@ -54,7 +54,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
private static final double MIN_PVALUE = 1E-320;
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( ! vc.isVariant() || vc.isFiltered() )
if ( !vc.isVariant() )
return null;
int[][] table;
@ -73,7 +73,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
if ( pvalue == null )
return null;
// use Math.abs to prevent -0's
Map<String, Object> map = new HashMap<String, Object>();
map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
return map;

View File

@ -241,6 +241,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
String alleleA = beagleGenotypePairs.get(0);
String alleleB = beagleGenotypePairs.get(1);
if ( alleleA.equals("null") || alleleB.equals("null") ) {
logger.warn("Beagle produced 'null' alleles at location "+ref.getLocus().toString()+". Ignoring.");
return 0;
}
// Beagle always produces genotype strings based on the strings we input in the likelihood file.
String refString = vc_input.getReference().getDisplayString();
if (refString.length() == 0) // ref was null
@ -315,8 +320,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
og = a1+"/"+a2;
// See if Beagle switched genotypes
if (!((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) ||
(bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))))){
if (! originalAlleleA.equals(Allele.NO_CALL) && beagleSwitchedGenotypes(bglAlleleA,originalAlleleA,bglAlleleB,originalAlleleB)){
originalAttributes.put("OG",og);
numGenotypesChangedByBeagle++;
}
@ -359,6 +363,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
return 1;
}
private boolean beagleSwitchedGenotypes(Allele bglAlleleA, Allele originalAlleleA, Allele bglAlleleB, Allele originalAlleleB) {
return !((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) ||
(bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))));
}
public Integer reduceInit() {
return 0; // Nothing to do here
}

View File

@ -139,6 +139,12 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean FAIL_MISSING_VALUES = false;
/**
* Invalidate previous filters applied to the VariantContext, applying only the filters here
*/
@Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
boolean invalidatePrevious = false;
// JEXL expressions for the filters
List<VariantContextUtils.JexlVCMatchExp> filterExps;
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
@ -215,6 +221,9 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
for ( VariantContext vc : VCs ) {
if ( invalidatePrevious ) {
vc = (new VariantContextBuilder(vc)).filters(new HashSet<String>()).make();
}
// filter based on previous mask position
if ( previousMaskPosition != null && // we saw a previous mask site
previousMaskPosition.getContig().equals(vc.getChr()) && // it's on the same contig

View File

@ -155,7 +155,9 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
/**
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites.
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
*/
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.List;
@ -126,7 +125,7 @@ public class VariantRecalibratorEngine {
iteration++;
model.maximizationStep( data );
currentChangeInMixtureCoefficients = model.normalizePMixtureLog10();
model.expectationStep(data);
model.expectationStep( data );
if( iteration % 5 == 0 ) { // cut down on the number of output lines so that users can read the warning messages
logger.info("Finished iteration " + iteration + ". \tCurrent change in mixture coefficients = " + String.format("%.5f", currentChangeInMixtureCoefficients));
}

View File

@ -105,7 +105,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
* and each named argument will be labeled as such in the output (i.e., set=name rather than
* set=variants2). The order of arguments does not matter unless except for the naming, so
* if you provide an rod priority list and no explicit names than variants, variants2, etc
* are techincally order dependent. It is strongly recommended to provide explicit names when
* are technically order dependent. It is strongly recommended to provide explicit names when
* a rod priority list is provided.
*/
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)

View File

@ -467,6 +467,6 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
}
public long sizeOfOverlap( final GenomeLoc that ) {
return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) : 0L );
return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L );
}
}

View File

@ -127,6 +127,21 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return mArray.isEmpty();
}
/**
* Determine if the given loc overlaps any loc in the sorted set
*
* @param loc the location to test
* @return
*/
public boolean overlaps(final GenomeLoc loc) {
for(final GenomeLoc e : mArray) {
if(e.overlapsP(loc)) {
return true;
}
}
return false;
}
/**
* add a genomeLoc to the collection, simply inserting in order into the set
*

View File

@ -1,19 +0,0 @@
package org.broadinstitute.sting.utils.activeregion;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 1/4/12
*/
public class ActiveRead {
final public GATKSAMRecord read;
final public boolean isPrimaryRegion;
ActiveRead( final GATKSAMRecord read, final boolean isPrimaryRegion ) {
this.read = read;
this.isPrimaryRegion = isPrimaryRegion;
}
}

View File

@ -16,40 +16,49 @@ import java.util.ArrayList;
public class ActiveRegion implements HasGenomeLocation {
private final ArrayList<ActiveRead> reads = new ArrayList<ActiveRead>();
private byte[] reference = null;
private final GenomeLoc loc;
private GenomeLoc referenceLoc = null;
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
private final GenomeLoc activeRegionLoc;
private final GenomeLoc extendedLoc;
private final int extension;
private GenomeLoc fullExtentReferenceLoc = null;
private final GenomeLocParser genomeLocParser;
public final boolean isActive;
public ActiveRegion( final GenomeLoc loc, final boolean isActive, final GenomeLocParser genomeLocParser ) {
this.loc = loc;
public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
this.activeRegionLoc = activeRegionLoc;
this.isActive = isActive;
this.genomeLocParser = genomeLocParser;
referenceLoc = loc;
this.extension = extension;
extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension);
fullExtentReferenceLoc = extendedLoc;
}
// add each read to the bin and extend the reference genome loc if needed
public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) {
referenceLoc = referenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
reads.add( new ActiveRead(read, isPrimaryRegion) );
// add each read to the bin and extend the reference genome activeRegionLoc if needed
public void add( final GATKSAMRecord read ) {
fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
reads.add( read );
}
public ArrayList<ActiveRead> getReads() { return reads; }
public ArrayList<GATKSAMRecord> getReads() { return reads; }
public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) {
// set up the reference if we haven't done so yet
if ( reference == null ) {
reference = referenceReader.getSubsequenceAt(referenceLoc.getContig(), referenceLoc.getStart(), referenceLoc.getStop()).getBases();
}
return reference;
return getReference( referenceReader, 0 );
}
public GenomeLoc getLocation() { return loc; }
public GenomeLoc getReferenceLocation() { return referenceLoc; }
public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(),
Math.max(1, fullExtentReferenceLoc.getStart() - padding),
Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases();
}
@Override
public GenomeLoc getLocation() { return activeRegionLoc; }
public GenomeLoc getExtendedLoc() { return extendedLoc; }
public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; }
public int getExtension() { return extension; }
public int size() { return reads.size(); }
public void clearReads() { reads.clear(); }
public void remove( final GATKSAMRecord read ) { reads.remove( read ); }
public void removeAll( final ArrayList<GATKSAMRecord> readsToRemove ) { reads.removeAll( readsToRemove ); }
}

View File

@ -1,9 +1,15 @@
package org.broadinstitute.sting.utils.fragments;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
@ -121,4 +127,63 @@ public class FragmentUtils {
return create(reads, reads.size(), SamRecordGetter);
}
public final static List<GATKSAMRecord> mergeOverlappingPairedFragments( List<GATKSAMRecord> overlappingPair ) {
final byte MIN_QUAL_BAD_OVERLAP = 16;
if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); }
GATKSAMRecord firstRead = overlappingPair.get(0);
GATKSAMRecord secondRead = overlappingPair.get(1);
if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
firstRead = overlappingPair.get(1);
secondRead = overlappingPair.get(0);
}
if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A
}
if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) {
return overlappingPair; // fragments contain indels so don't merge them
}
final Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart());
final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );
final int numBases = firstReadStop + secondRead.getReadLength();
final byte[] bases = new byte[numBases];
final byte[] quals = new byte[numBases];
final byte[] firstReadBases = firstRead.getReadBases();
final byte[] firstReadQuals = firstRead.getBaseQualities();
final byte[] secondReadBases = secondRead.getReadBases();
final byte[] secondReadQuals = secondRead.getBaseQualities();
for(int iii = 0; iii < firstReadStop; iii++) {
bases[iii] = firstReadBases[iii];
quals[iii] = firstReadQuals[iii];
}
for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) {
if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) {
return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them
}
bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] );
quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] );
}
for(int iii = firstRead.getReadLength(); iii < numBases; iii++) {
bases[iii] = secondReadBases[iii-firstReadStop];
quals[iii] = secondReadQuals[iii-firstReadStop];
}
final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader());
returnRead.setAlignmentStart(firstRead.getUnclippedStart());
returnRead.setReadBases( bases );
returnRead.setBaseQualities( quals );
returnRead.setReadGroup( firstRead.getReadGroup() );
returnRead.setReferenceName( firstRead.getReferenceName() );
final CigarElement c = new CigarElement(bases.length, CigarOperator.M);
final ArrayList<CigarElement> cList = new ArrayList<CigarElement>();
cList.add(c);
returnRead.setCigar( new Cigar( cList ));
returnRead.setMappingQuality( firstRead.getMappingQuality() );
final ArrayList<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>();
returnList.add(returnRead);
return returnList;
}
}

View File

@ -608,7 +608,7 @@ public class ReadUtils {
* Example: Locus => {read1, read2, ..., readN}
*
* - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage.
* Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage.
* Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage.
* Example: Read => {true, true, false, ... false}
*
* @param readList the list of reads to generate the association mappings

View File

@ -464,7 +464,11 @@ public class VariantContextUtils {
/**
* Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this.
*/
KEEP_IF_ALL_UNFILTERED
KEEP_IF_ALL_UNFILTERED,
/**
* If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset.
*/
KEEP_UNCONDITIONAL
}
/**
@ -635,7 +639,7 @@ public class VariantContextUtils {
}
// if at least one record was unfiltered and we want a union, clear all of the filters
if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL )
filters.clear();

View File

@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " +
"--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " +
"--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " +
"-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847"));
"-o %s -NO_HEADER", 1, Arrays.asList("6d0f213918e3b9ea33bc2f8a51a462f1"));
executeTest("test BeagleOutputToVCF", spec);
}
@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
"--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
"--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
"-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee"));
"-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("ddbf490f1d9f37cc79fe414c8d40886f"));
executeTest("testBeagleChangesSitesToRef",spec);
}

View File

@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("43e7a17d95b1a0cf72e669657794d802"));
Arrays.asList("1899bdb956c62bbcbf160b18cd3aea60"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("36ce53ae4319718ad9c8ae391deebc8c"));
Arrays.asList("320f61c87253aba77d6dc782242cba8b"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}

View File

@ -26,23 +26,23 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e
var cur : String = null
if ( elems.hasNext ) {
cur = elems.next
} else {
out.printf("%s%n",prev)
}
while ( elems.hasNext ) {
out.printf("%s%n",prev)
while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) {
cur = elems.next
}
if ( ! cur.equals(prev) ) {
if ( elems.hasNext ) {
prev = cur
while ( elems.hasNext ) {
out.printf("%s%n",prev)
while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) {
cur = elems.next
} else {
out.printf("%s%n",cur)
}
if ( ! cur.equals(prev) ) {
if ( elems.hasNext ) {
prev = cur
cur = elems.next
}
}
}
out.printf("%s%n",prev)
out.printf("%s%n",cur)
} else {
out.printf("%s%n",prev)
}
out.close

View File

@ -6,7 +6,7 @@ import collection.JavaConversions._
import org.broadinstitute.sting.commandline._
import java.io.{PrintWriter, PrintStream, File}
class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction {
class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction {
def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList)
@Input(doc="VCF from which to extract samples") var inputVCF : File = inVCF