Enabling correct high-performance ROD walker and moved VariantEval over to it. Performance improvements in variantEval in general. See wiki for full description

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-10-20 23:31:13 +00:00
parent 4a8a6468be
commit caa3187af8
6 changed files with 80 additions and 69 deletions

View File

@ -543,7 +543,7 @@ public class GenomeAnalysisEngine {
ShardStrategyFactory.SHATTER_STRATEGY shardType;
if (walker instanceof LocusWalker) {
if (walker instanceof RodWalker) SHARD_SIZE *= 100;
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
if (intervals != null) {
shardType = (walker.isReduceByInterval()) ?

View File

@ -27,11 +27,6 @@ import net.sf.samtools.SAMRecord;
* A view into the reference-ordered data in the provider.
*/
public class RodLocusView extends LocusView implements ReferenceOrderedView {
/**
* The provider that's supplying our backing data.
*/
//private final ShardDataProvider provider;
/**
* The data sources along with their current states.
*/
@ -64,14 +59,19 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
List< Iterator<RODRecordList<ReferenceOrderedDatum>> > iterators = new LinkedList< Iterator<RODRecordList<ReferenceOrderedDatum>> >();
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) {
if ( DEBUG ) System.out.printf("Shard is %s%n", loc);
// grab the ROD iterator from the data source, and compute the first location in this shard, forwarding
// the iterator to immediately before it, so that it can be added to the merging iterator primed for
// next() to return the first real ROD in this shard
SeekableRODIterator it = (SeekableRODIterator)dataSource.seek(provider.getShard());
RODRecordList<ReferenceOrderedDatum> x = it.seekForward(loc);
GenomeLoc shardLoc = provider.getShard().getGenomeLoc();
it.seekForward(GenomeLocParser.createGenomeLoc(shardLoc.getContigIndex(), shardLoc.getStart()-1, shardLoc.getStart()-1));
// we need to special case the interval so we don't always think there's a rod at the first location
if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) {
if ( interval != null )
throw new RuntimeException("BUG: interval local variable already assigned " + interval);
interval = x;
interval = (RODRecordList<ReferenceOrderedDatum>)it.next();
} else {
iterators.add( it );
}
@ -142,30 +142,16 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
return rodQueue.allElementsLTE(marker);
}
// private Collection<ReferenceOrderedDatum> getSpanningRods(ReferenceOrderedDatum marker) {
// Collection<ReferenceOrderedDatum> allFromQueue = rodQueue.allElementsLTE(marker);
// Collection<ReferenceOrderedDatum> all = new LinkedList<ReferenceOrderedDatum>(allFromQueue);
//
// Iterator<ReferenceOrderedDatum> it = multiLocusRODs.iterator();
// while ( it.hasNext() ) {
// ReferenceOrderedDatum mlr = it.next();
// if ( mlr.getLocation().overlapsP(marker.getLocation()) ) {
// all.add(mlr);
// } else {
// it.remove(); // no long covering this site
// }
// }
//
// for ( ReferenceOrderedDatum datum : allFromQueue ) {
// if ( ! datum.getLocation().isSingleBP() ) {
// System.out.printf("Adding multi-locus %s%n", datum);
// multiLocusRODs.add(datum);
// }
// }
//
// return all;
// }
/**
* Returns the number of reference bases that have been skipped:
*
* 1 -- since the last processed location if we have one
* 2 -- from the beginning of the shard if this is the first loc
* 3 -- from the last location to the current position
*
* @param currentPos
* @return
*/
private long getSkippedBases( GenomeLoc currentPos ) {
long skippedBases = 0;
@ -179,7 +165,8 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
}
if ( skippedBases < -1 ) { // minus 1 value is ok
throw new RuntimeException(String.format("BUG: skipped bases is < 0: cur=%s vs. last=%s", currentPos, lastLoc));
throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. last=%s, shard=%s",
skippedBases, currentPos, lastLoc, shard.getGenomeLoc()));
}
return Math.max(skippedBases, 0);
}
@ -215,8 +202,6 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
* Closes the current view.
*/
public void close() {
//rodQueue.close();
rodQueue = null;
tracker = null;
}

View File

@ -49,9 +49,8 @@ public class TraverseLoci extends TraversalEngine {
LocusView locusView = getLocusView( walker, dataProvider );
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
throw new RuntimeException("Engine currently doesn't support RodWalkers");
//if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
// throw new RuntimeException("Engine currently doesn't support RodWalkers");
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all

View File

@ -21,19 +21,29 @@ import java.util.List;
*
*/
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
int minDistanceForFlagging = 5;
/**
* Threshold distances between neighboring SNPs we will track and assess
*/
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
int[] variantsWithClusters = new int[neighborWiseBoundries.length];
/**
* Keep track of the last variation we saw in the stream
*/
Variation lastVariation = null;
/**
* The interval that the last variation occurred in. Don't think that SNPs from different intervals are
* too close together in hybrid selection.
*/
GenomeLoc lastVariantInterval = null;
int nSeen = 0;
public ClusterCounterAnalysis() {
super("cluster_counter_analysis");
variantsWithClusters = new ArrayList<HashSet<GenomeLoc>>(neighborWiseBoundries.length);
for (int i = 0; i < neighborWiseBoundries.length; i++)
variantsWithClusters.add(new HashSet<GenomeLoc>());
}
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
@ -46,26 +56,27 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
if (lastVariation != null) {
GenomeLoc eL = eval.getLocation();
GenomeLoc lvL = lastVariation.getLocation();
if (eL.getContigIndex() == lvL.getContigIndex()) {
// if we are on the same contig, and we in the same interval
if (eL.getContigIndex() == lvL.getContigIndex() && ! (lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0)) {
long d = eL.distance(lvL);
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
// we're on different intervals
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
} else {
nSeen++;
StringBuilder s = new StringBuilder();
for (int i = 0; i < neighborWiseBoundries.length; i++) {
int maxDist = neighborWiseBoundries[i];
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
if ( d <= maxDist ) {
variantsWithClusters.get(i).add(eL);
}
nSeen++;
StringBuilder s = new StringBuilder();
for (int i = 0; i < neighborWiseBoundries.length; i++) {
int maxDist = neighborWiseBoundries[i];
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
if ( d <= maxDist ) {
variantsWithClusters[i]++;
}
r = d <= minDistanceForFlagging ? String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString()) : null;
}
// lookup in master for performance reasons
if ( d <= minDistanceForFlagging && getMaster().includeViolations() )
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
}
}
// eval is now the last variation
lastVariation = eval;
lastVariantInterval = interval;
}
@ -79,7 +90,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
s.add(String.format("description maxDist count"));
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
int maxDist = neighborWiseBoundries[i];
int count = variantsWithClusters.get(i).size();
int count = variantsWithClusters[i];
s.add(String.format("snps_within_clusters_of_size %10d %10d", maxDist, count));
}

View File

@ -27,7 +27,8 @@ import java.util.*;
*/
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=ReferenceOrderedDatum.class)}) // right now we have no base variant class for rods, this should change
@Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=ReferenceOrderedDatum.class), @RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="hapmap-chip",type=ReferenceOrderedDatum.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
public class VariantEvalWalker extends RefWalker<Integer, Integer> {
//public class VariantEvalWalker extends RefWalker<Integer, Integer> {
public class VariantEvalWalker extends RodWalker<Integer, Integer> {
@Argument(shortName="minConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false)
public int minConfidenceScore = -1;
@ -44,7 +45,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
public boolean explode = false;
@Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false)
public boolean includeViolations = false;
public boolean mIncludeViolations = false;
@Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false)
public boolean extensiveSubsets = false;
@ -204,6 +205,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
nMappedSites += context.getSkippedBases();
//System.out.printf("Tracker at %s is %s, ref is %s%n", context.getLocation(), tracker, ref);
//if ( ref == null )
// out.printf("Last position was %s: skipping %d bases%n",
// context.getLocation(), context.getSkippedBases() );
if ( ref == null ) { // we are seeing the last site
return 0;
}
nMappedSites++;
int nBoundGoodRods = tracker.getNBoundRodTracks("interval");
@ -244,14 +255,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
return 1;
}
public boolean includeViolations() { return mIncludeViolations; }
public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,
RefMetaDataTracker tracker, char ref, AlignmentContext context) {
// Iterate over each analysis, and update it
if (getAnalysisSet(analysisSetName) != null) {
for (VariantAnalysis analysis : getAnalysisSet(analysisSetName)) {
ArrayList<VariantAnalysis> set = getAnalysisSet(analysisSetName);
if ( set != null ) {
for ( VariantAnalysis analysis : set ) {
String s = analysis.update(eval, tracker, ref, context);
if (s != null && includeViolations) {
if ( s != null && includeViolations() ) {
analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s);
}
}

View File

@ -6,6 +6,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.Utils;
import org.junit.Test;
import org.apache.log4j.Appender;
import org.apache.log4j.WriterAppender;
import java.io.File;
import java.io.FileInputStream;
@ -25,12 +27,13 @@ public class WalkerTest extends BaseTest {
String filemd5sum = bigInt.toString(16);
while (filemd5sum.length() < 32) filemd5sum = "0" + filemd5sum; // pad to length 32
if ( parameterize() || expectedMD5.equals("") ) {
logger.warn(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b",
System.out.println(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b",
name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5)));
} else {
logger.warn(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
System.out.flush();
Assert.assertEquals(name + " Mismatching MD5s", expectedMD5, filemd5sum);
logger.warn(String.format(" => %s PASSED", name));
System.out.println(String.format(" => %s PASSED", name));
}
return filemd5sum;
@ -119,8 +122,8 @@ public class WalkerTest extends BaseTest {
}
final String args = String.format(spec.args, tmpFiles.toArray());
logger.warn(Utils.dupString('-', 80));
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
System.out.println(Utils.dupString('-', 80));
System.out.println(String.format("Executing test %s with GATK arguments: %s", name, args));
CommandLineGATK instance = new CommandLineGATK();
CommandLineExecutable.start(instance, args.split(" "));
@ -134,6 +137,6 @@ public class WalkerTest extends BaseTest {
@Test
public void testWalkerTest() {
//logger.warn("WalkerTest is just a framework");
//System.out.println("WalkerTest is just a framework");
}
}