Enabling correct high-performance ROD walker and moved VariantEval over to it. Performance improvements in variantEval in general. See wiki for full description
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4a8a6468be
commit
caa3187af8
|
|
@ -543,7 +543,7 @@ public class GenomeAnalysisEngine {
|
||||||
ShardStrategyFactory.SHATTER_STRATEGY shardType;
|
ShardStrategyFactory.SHATTER_STRATEGY shardType;
|
||||||
|
|
||||||
if (walker instanceof LocusWalker) {
|
if (walker instanceof LocusWalker) {
|
||||||
if (walker instanceof RodWalker) SHARD_SIZE *= 100;
|
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
|
||||||
|
|
||||||
if (intervals != null) {
|
if (intervals != null) {
|
||||||
shardType = (walker.isReduceByInterval()) ?
|
shardType = (walker.isReduceByInterval()) ?
|
||||||
|
|
|
||||||
|
|
@ -27,11 +27,6 @@ import net.sf.samtools.SAMRecord;
|
||||||
* A view into the reference-ordered data in the provider.
|
* A view into the reference-ordered data in the provider.
|
||||||
*/
|
*/
|
||||||
public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
||||||
/**
|
|
||||||
* The provider that's supplying our backing data.
|
|
||||||
*/
|
|
||||||
//private final ShardDataProvider provider;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The data sources along with their current states.
|
* The data sources along with their current states.
|
||||||
*/
|
*/
|
||||||
|
|
@ -64,14 +59,19 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
||||||
List< Iterator<RODRecordList<ReferenceOrderedDatum>> > iterators = new LinkedList< Iterator<RODRecordList<ReferenceOrderedDatum>> >();
|
List< Iterator<RODRecordList<ReferenceOrderedDatum>> > iterators = new LinkedList< Iterator<RODRecordList<ReferenceOrderedDatum>> >();
|
||||||
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) {
|
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) {
|
||||||
if ( DEBUG ) System.out.printf("Shard is %s%n", loc);
|
if ( DEBUG ) System.out.printf("Shard is %s%n", loc);
|
||||||
|
|
||||||
|
// grab the ROD iterator from the data source, and compute the first location in this shard, forwarding
|
||||||
|
// the iterator to immediately before it, so that it can be added to the merging iterator primed for
|
||||||
|
// next() to return the first real ROD in this shard
|
||||||
SeekableRODIterator it = (SeekableRODIterator)dataSource.seek(provider.getShard());
|
SeekableRODIterator it = (SeekableRODIterator)dataSource.seek(provider.getShard());
|
||||||
RODRecordList<ReferenceOrderedDatum> x = it.seekForward(loc);
|
GenomeLoc shardLoc = provider.getShard().getGenomeLoc();
|
||||||
|
it.seekForward(GenomeLocParser.createGenomeLoc(shardLoc.getContigIndex(), shardLoc.getStart()-1, shardLoc.getStart()-1));
|
||||||
|
|
||||||
// we need to special case the interval so we don't always think there's a rod at the first location
|
// we need to special case the interval so we don't always think there's a rod at the first location
|
||||||
if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) {
|
if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) {
|
||||||
if ( interval != null )
|
if ( interval != null )
|
||||||
throw new RuntimeException("BUG: interval local variable already assigned " + interval);
|
throw new RuntimeException("BUG: interval local variable already assigned " + interval);
|
||||||
interval = x;
|
interval = (RODRecordList<ReferenceOrderedDatum>)it.next();
|
||||||
} else {
|
} else {
|
||||||
iterators.add( it );
|
iterators.add( it );
|
||||||
}
|
}
|
||||||
|
|
@ -142,30 +142,16 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
||||||
return rodQueue.allElementsLTE(marker);
|
return rodQueue.allElementsLTE(marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
// private Collection<ReferenceOrderedDatum> getSpanningRods(ReferenceOrderedDatum marker) {
|
/**
|
||||||
// Collection<ReferenceOrderedDatum> allFromQueue = rodQueue.allElementsLTE(marker);
|
* Returns the number of reference bases that have been skipped:
|
||||||
// Collection<ReferenceOrderedDatum> all = new LinkedList<ReferenceOrderedDatum>(allFromQueue);
|
*
|
||||||
//
|
* 1 -- since the last processed location if we have one
|
||||||
// Iterator<ReferenceOrderedDatum> it = multiLocusRODs.iterator();
|
* 2 -- from the beginning of the shard if this is the first loc
|
||||||
// while ( it.hasNext() ) {
|
* 3 -- from the last location to the current position
|
||||||
// ReferenceOrderedDatum mlr = it.next();
|
*
|
||||||
// if ( mlr.getLocation().overlapsP(marker.getLocation()) ) {
|
* @param currentPos
|
||||||
// all.add(mlr);
|
* @return
|
||||||
// } else {
|
*/
|
||||||
// it.remove(); // no long covering this site
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// for ( ReferenceOrderedDatum datum : allFromQueue ) {
|
|
||||||
// if ( ! datum.getLocation().isSingleBP() ) {
|
|
||||||
// System.out.printf("Adding multi-locus %s%n", datum);
|
|
||||||
// multiLocusRODs.add(datum);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// return all;
|
|
||||||
// }
|
|
||||||
|
|
||||||
private long getSkippedBases( GenomeLoc currentPos ) {
|
private long getSkippedBases( GenomeLoc currentPos ) {
|
||||||
long skippedBases = 0;
|
long skippedBases = 0;
|
||||||
|
|
||||||
|
|
@ -179,7 +165,8 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( skippedBases < -1 ) { // minus 1 value is ok
|
if ( skippedBases < -1 ) { // minus 1 value is ok
|
||||||
throw new RuntimeException(String.format("BUG: skipped bases is < 0: cur=%s vs. last=%s", currentPos, lastLoc));
|
throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. last=%s, shard=%s",
|
||||||
|
skippedBases, currentPos, lastLoc, shard.getGenomeLoc()));
|
||||||
}
|
}
|
||||||
return Math.max(skippedBases, 0);
|
return Math.max(skippedBases, 0);
|
||||||
}
|
}
|
||||||
|
|
@ -215,8 +202,6 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
||||||
* Closes the current view.
|
* Closes the current view.
|
||||||
*/
|
*/
|
||||||
public void close() {
|
public void close() {
|
||||||
//rodQueue.close();
|
|
||||||
|
|
||||||
rodQueue = null;
|
rodQueue = null;
|
||||||
tracker = null;
|
tracker = null;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -49,9 +49,8 @@ public class TraverseLoci extends TraversalEngine {
|
||||||
|
|
||||||
LocusView locusView = getLocusView( walker, dataProvider );
|
LocusView locusView = getLocusView( walker, dataProvider );
|
||||||
|
|
||||||
|
//if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
|
||||||
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
|
// throw new RuntimeException("Engine currently doesn't support RodWalkers");
|
||||||
throw new RuntimeException("Engine currently doesn't support RodWalkers");
|
|
||||||
|
|
||||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,19 +21,29 @@ import java.util.List;
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||||
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
|
|
||||||
int minDistanceForFlagging = 5;
|
int minDistanceForFlagging = 5;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Threshold distances between neighboring SNPs we will track and assess
|
||||||
|
*/
|
||||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
|
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
|
||||||
|
int[] variantsWithClusters = new int[neighborWiseBoundries.length];
|
||||||
|
/**
|
||||||
|
* Keep track of the last variation we saw in the stream
|
||||||
|
*/
|
||||||
Variation lastVariation = null;
|
Variation lastVariation = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The interval that the last variation occurred in. Don't think that SNPs from different intervals are
|
||||||
|
* too close together in hybrid selection.
|
||||||
|
*/
|
||||||
GenomeLoc lastVariantInterval = null;
|
GenomeLoc lastVariantInterval = null;
|
||||||
int nSeen = 0;
|
int nSeen = 0;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public ClusterCounterAnalysis() {
|
public ClusterCounterAnalysis() {
|
||||||
super("cluster_counter_analysis");
|
super("cluster_counter_analysis");
|
||||||
|
|
||||||
variantsWithClusters = new ArrayList<HashSet<GenomeLoc>>(neighborWiseBoundries.length);
|
|
||||||
for (int i = 0; i < neighborWiseBoundries.length; i++)
|
|
||||||
variantsWithClusters.add(new HashSet<GenomeLoc>());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||||
|
|
@ -46,26 +56,27 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
|
||||||
if (lastVariation != null) {
|
if (lastVariation != null) {
|
||||||
GenomeLoc eL = eval.getLocation();
|
GenomeLoc eL = eval.getLocation();
|
||||||
GenomeLoc lvL = lastVariation.getLocation();
|
GenomeLoc lvL = lastVariation.getLocation();
|
||||||
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
|
||||||
|
// if we are on the same contig, and we in the same interval
|
||||||
|
if (eL.getContigIndex() == lvL.getContigIndex() && ! (lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0)) {
|
||||||
long d = eL.distance(lvL);
|
long d = eL.distance(lvL);
|
||||||
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
|
nSeen++;
|
||||||
// we're on different intervals
|
StringBuilder s = new StringBuilder();
|
||||||
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
for (int i = 0; i < neighborWiseBoundries.length; i++) {
|
||||||
} else {
|
int maxDist = neighborWiseBoundries[i];
|
||||||
nSeen++;
|
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
|
||||||
StringBuilder s = new StringBuilder();
|
if ( d <= maxDist ) {
|
||||||
for (int i = 0; i < neighborWiseBoundries.length; i++) {
|
variantsWithClusters[i]++;
|
||||||
int maxDist = neighborWiseBoundries[i];
|
|
||||||
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
|
|
||||||
if ( d <= maxDist ) {
|
|
||||||
variantsWithClusters.get(i).add(eL);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
r = d <= minDistanceForFlagging ? String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString()) : null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// lookup in master for performance reasons
|
||||||
|
if ( d <= minDistanceForFlagging && getMaster().includeViolations() )
|
||||||
|
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// eval is now the last variation
|
||||||
lastVariation = eval;
|
lastVariation = eval;
|
||||||
lastVariantInterval = interval;
|
lastVariantInterval = interval;
|
||||||
}
|
}
|
||||||
|
|
@ -79,7 +90,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
|
||||||
s.add(String.format("description maxDist count"));
|
s.add(String.format("description maxDist count"));
|
||||||
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||||
int maxDist = neighborWiseBoundries[i];
|
int maxDist = neighborWiseBoundries[i];
|
||||||
int count = variantsWithClusters.get(i).size();
|
int count = variantsWithClusters[i];
|
||||||
s.add(String.format("snps_within_clusters_of_size %10d %10d", maxDist, count));
|
s.add(String.format("snps_within_clusters_of_size %10d %10d", maxDist, count));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,8 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=ReferenceOrderedDatum.class)}) // right now we have no base variant class for rods, this should change
|
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=ReferenceOrderedDatum.class)}) // right now we have no base variant class for rods, this should change
|
||||||
@Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=ReferenceOrderedDatum.class), @RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="hapmap-chip",type=ReferenceOrderedDatum.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
|
@Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=ReferenceOrderedDatum.class), @RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="hapmap-chip",type=ReferenceOrderedDatum.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
|
||||||
public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
//public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||||
|
public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||||
@Argument(shortName="minConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false)
|
@Argument(shortName="minConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false)
|
||||||
public int minConfidenceScore = -1;
|
public int minConfidenceScore = -1;
|
||||||
|
|
||||||
|
|
@ -44,7 +45,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||||
public boolean explode = false;
|
public boolean explode = false;
|
||||||
|
|
||||||
@Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false)
|
@Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false)
|
||||||
public boolean includeViolations = false;
|
public boolean mIncludeViolations = false;
|
||||||
|
|
||||||
@Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false)
|
@Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false)
|
||||||
public boolean extensiveSubsets = false;
|
public boolean extensiveSubsets = false;
|
||||||
|
|
@ -204,6 +205,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
nMappedSites += context.getSkippedBases();
|
||||||
|
|
||||||
|
//System.out.printf("Tracker at %s is %s, ref is %s%n", context.getLocation(), tracker, ref);
|
||||||
|
//if ( ref == null )
|
||||||
|
// out.printf("Last position was %s: skipping %d bases%n",
|
||||||
|
// context.getLocation(), context.getSkippedBases() );
|
||||||
|
if ( ref == null ) { // we are seeing the last site
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
nMappedSites++;
|
nMappedSites++;
|
||||||
|
|
||||||
int nBoundGoodRods = tracker.getNBoundRodTracks("interval");
|
int nBoundGoodRods = tracker.getNBoundRodTracks("interval");
|
||||||
|
|
@ -244,14 +255,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean includeViolations() { return mIncludeViolations; }
|
||||||
|
|
||||||
public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,
|
public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,
|
||||||
RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||||
// Iterate over each analysis, and update it
|
// Iterate over each analysis, and update it
|
||||||
if (getAnalysisSet(analysisSetName) != null) {
|
ArrayList<VariantAnalysis> set = getAnalysisSet(analysisSetName);
|
||||||
for (VariantAnalysis analysis : getAnalysisSet(analysisSetName)) {
|
if ( set != null ) {
|
||||||
|
for ( VariantAnalysis analysis : set ) {
|
||||||
String s = analysis.update(eval, tracker, ref, context);
|
String s = analysis.update(eval, tracker, ref, context);
|
||||||
if (s != null && includeViolations) {
|
if ( s != null && includeViolations() ) {
|
||||||
analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s);
|
analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||||
import org.broadinstitute.sting.utils.Pair;
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.apache.log4j.Appender;
|
||||||
|
import org.apache.log4j.WriterAppender;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
|
@ -25,12 +27,13 @@ public class WalkerTest extends BaseTest {
|
||||||
String filemd5sum = bigInt.toString(16);
|
String filemd5sum = bigInt.toString(16);
|
||||||
while (filemd5sum.length() < 32) filemd5sum = "0" + filemd5sum; // pad to length 32
|
while (filemd5sum.length() < 32) filemd5sum = "0" + filemd5sum; // pad to length 32
|
||||||
if ( parameterize() || expectedMD5.equals("") ) {
|
if ( parameterize() || expectedMD5.equals("") ) {
|
||||||
logger.warn(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b",
|
System.out.println(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b",
|
||||||
name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5)));
|
name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5)));
|
||||||
} else {
|
} else {
|
||||||
logger.warn(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
|
System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
|
||||||
|
System.out.flush();
|
||||||
Assert.assertEquals(name + " Mismatching MD5s", expectedMD5, filemd5sum);
|
Assert.assertEquals(name + " Mismatching MD5s", expectedMD5, filemd5sum);
|
||||||
logger.warn(String.format(" => %s PASSED", name));
|
System.out.println(String.format(" => %s PASSED", name));
|
||||||
}
|
}
|
||||||
|
|
||||||
return filemd5sum;
|
return filemd5sum;
|
||||||
|
|
@ -119,8 +122,8 @@ public class WalkerTest extends BaseTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
final String args = String.format(spec.args, tmpFiles.toArray());
|
final String args = String.format(spec.args, tmpFiles.toArray());
|
||||||
logger.warn(Utils.dupString('-', 80));
|
System.out.println(Utils.dupString('-', 80));
|
||||||
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
|
System.out.println(String.format("Executing test %s with GATK arguments: %s", name, args));
|
||||||
|
|
||||||
CommandLineGATK instance = new CommandLineGATK();
|
CommandLineGATK instance = new CommandLineGATK();
|
||||||
CommandLineExecutable.start(instance, args.split(" "));
|
CommandLineExecutable.start(instance, args.split(" "));
|
||||||
|
|
@ -134,6 +137,6 @@ public class WalkerTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWalkerTest() {
|
public void testWalkerTest() {
|
||||||
//logger.warn("WalkerTest is just a framework");
|
//System.out.println("WalkerTest is just a framework");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue