Enabling correct high-performance ROD walker and moved VariantEval over to it. Performance improvements in variantEval in general. See wiki for full description
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4a8a6468be
commit
caa3187af8
|
|
@ -543,7 +543,7 @@ public class GenomeAnalysisEngine {
|
|||
ShardStrategyFactory.SHATTER_STRATEGY shardType;
|
||||
|
||||
if (walker instanceof LocusWalker) {
|
||||
if (walker instanceof RodWalker) SHARD_SIZE *= 100;
|
||||
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
|
||||
|
||||
if (intervals != null) {
|
||||
shardType = (walker.isReduceByInterval()) ?
|
||||
|
|
|
|||
|
|
@ -27,11 +27,6 @@ import net.sf.samtools.SAMRecord;
|
|||
* A view into the reference-ordered data in the provider.
|
||||
*/
|
||||
public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
||||
/**
|
||||
* The provider that's supplying our backing data.
|
||||
*/
|
||||
//private final ShardDataProvider provider;
|
||||
|
||||
/**
|
||||
* The data sources along with their current states.
|
||||
*/
|
||||
|
|
@ -64,14 +59,19 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
|||
List< Iterator<RODRecordList<ReferenceOrderedDatum>> > iterators = new LinkedList< Iterator<RODRecordList<ReferenceOrderedDatum>> >();
|
||||
for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) {
|
||||
if ( DEBUG ) System.out.printf("Shard is %s%n", loc);
|
||||
|
||||
// grab the ROD iterator from the data source, and compute the first location in this shard, forwarding
|
||||
// the iterator to immediately before it, so that it can be added to the merging iterator primed for
|
||||
// next() to return the first real ROD in this shard
|
||||
SeekableRODIterator it = (SeekableRODIterator)dataSource.seek(provider.getShard());
|
||||
RODRecordList<ReferenceOrderedDatum> x = it.seekForward(loc);
|
||||
GenomeLoc shardLoc = provider.getShard().getGenomeLoc();
|
||||
it.seekForward(GenomeLocParser.createGenomeLoc(shardLoc.getContigIndex(), shardLoc.getStart()-1, shardLoc.getStart()-1));
|
||||
|
||||
// we need to special case the interval so we don't always think there's a rod at the first location
|
||||
if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) {
|
||||
if ( interval != null )
|
||||
throw new RuntimeException("BUG: interval local variable already assigned " + interval);
|
||||
interval = x;
|
||||
interval = (RODRecordList<ReferenceOrderedDatum>)it.next();
|
||||
} else {
|
||||
iterators.add( it );
|
||||
}
|
||||
|
|
@ -142,30 +142,16 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
|||
return rodQueue.allElementsLTE(marker);
|
||||
}
|
||||
|
||||
// private Collection<ReferenceOrderedDatum> getSpanningRods(ReferenceOrderedDatum marker) {
|
||||
// Collection<ReferenceOrderedDatum> allFromQueue = rodQueue.allElementsLTE(marker);
|
||||
// Collection<ReferenceOrderedDatum> all = new LinkedList<ReferenceOrderedDatum>(allFromQueue);
|
||||
//
|
||||
// Iterator<ReferenceOrderedDatum> it = multiLocusRODs.iterator();
|
||||
// while ( it.hasNext() ) {
|
||||
// ReferenceOrderedDatum mlr = it.next();
|
||||
// if ( mlr.getLocation().overlapsP(marker.getLocation()) ) {
|
||||
// all.add(mlr);
|
||||
// } else {
|
||||
// it.remove(); // no long covering this site
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// for ( ReferenceOrderedDatum datum : allFromQueue ) {
|
||||
// if ( ! datum.getLocation().isSingleBP() ) {
|
||||
// System.out.printf("Adding multi-locus %s%n", datum);
|
||||
// multiLocusRODs.add(datum);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return all;
|
||||
// }
|
||||
|
||||
/**
|
||||
* Returns the number of reference bases that have been skipped:
|
||||
*
|
||||
* 1 -- since the last processed location if we have one
|
||||
* 2 -- from the beginning of the shard if this is the first loc
|
||||
* 3 -- from the last location to the current position
|
||||
*
|
||||
* @param currentPos
|
||||
* @return
|
||||
*/
|
||||
private long getSkippedBases( GenomeLoc currentPos ) {
|
||||
long skippedBases = 0;
|
||||
|
||||
|
|
@ -179,7 +165,8 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
|||
}
|
||||
|
||||
if ( skippedBases < -1 ) { // minus 1 value is ok
|
||||
throw new RuntimeException(String.format("BUG: skipped bases is < 0: cur=%s vs. last=%s", currentPos, lastLoc));
|
||||
throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. last=%s, shard=%s",
|
||||
skippedBases, currentPos, lastLoc, shard.getGenomeLoc()));
|
||||
}
|
||||
return Math.max(skippedBases, 0);
|
||||
}
|
||||
|
|
@ -215,8 +202,6 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
|||
* Closes the current view.
|
||||
*/
|
||||
public void close() {
|
||||
//rodQueue.close();
|
||||
|
||||
rodQueue = null;
|
||||
tracker = null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,9 +49,8 @@ public class TraverseLoci extends TraversalEngine {
|
|||
|
||||
LocusView locusView = getLocusView( walker, dataProvider );
|
||||
|
||||
|
||||
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
|
||||
throw new RuntimeException("Engine currently doesn't support RodWalkers");
|
||||
//if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA )
|
||||
// throw new RuntimeException("Engine currently doesn't support RodWalkers");
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
|
||||
|
|
|
|||
|
|
@ -21,19 +21,29 @@ import java.util.List;
|
|||
*
|
||||
*/
|
||||
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
|
||||
int minDistanceForFlagging = 5;
|
||||
|
||||
/**
|
||||
* Threshold distances between neighboring SNPs we will track and assess
|
||||
*/
|
||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
|
||||
int[] variantsWithClusters = new int[neighborWiseBoundries.length];
|
||||
/**
|
||||
* Keep track of the last variation we saw in the stream
|
||||
*/
|
||||
Variation lastVariation = null;
|
||||
|
||||
/**
|
||||
* The interval that the last variation occurred in. Don't think that SNPs from different intervals are
|
||||
* too close together in hybrid selection.
|
||||
*/
|
||||
GenomeLoc lastVariantInterval = null;
|
||||
int nSeen = 0;
|
||||
|
||||
|
||||
|
||||
public ClusterCounterAnalysis() {
|
||||
super("cluster_counter_analysis");
|
||||
|
||||
variantsWithClusters = new ArrayList<HashSet<GenomeLoc>>(neighborWiseBoundries.length);
|
||||
for (int i = 0; i < neighborWiseBoundries.length; i++)
|
||||
variantsWithClusters.add(new HashSet<GenomeLoc>());
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
|
|
@ -46,26 +56,27 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
|
|||
if (lastVariation != null) {
|
||||
GenomeLoc eL = eval.getLocation();
|
||||
GenomeLoc lvL = lastVariation.getLocation();
|
||||
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
||||
|
||||
// if we are on the same contig, and we in the same interval
|
||||
if (eL.getContigIndex() == lvL.getContigIndex() && ! (lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0)) {
|
||||
long d = eL.distance(lvL);
|
||||
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
|
||||
// we're on different intervals
|
||||
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||
} else {
|
||||
nSeen++;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int i = 0; i < neighborWiseBoundries.length; i++) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
|
||||
if ( d <= maxDist ) {
|
||||
variantsWithClusters.get(i).add(eL);
|
||||
}
|
||||
nSeen++;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int i = 0; i < neighborWiseBoundries.length; i++) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
|
||||
if ( d <= maxDist ) {
|
||||
variantsWithClusters[i]++;
|
||||
}
|
||||
r = d <= minDistanceForFlagging ? String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString()) : null;
|
||||
}
|
||||
|
||||
// lookup in master for performance reasons
|
||||
if ( d <= minDistanceForFlagging && getMaster().includeViolations() )
|
||||
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
|
||||
}
|
||||
}
|
||||
|
||||
// eval is now the last variation
|
||||
lastVariation = eval;
|
||||
lastVariantInterval = interval;
|
||||
}
|
||||
|
|
@ -79,7 +90,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
|
|||
s.add(String.format("description maxDist count"));
|
||||
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
int count = variantsWithClusters.get(i).size();
|
||||
int count = variantsWithClusters[i];
|
||||
s.add(String.format("snps_within_clusters_of_size %10d %10d", maxDist, count));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,7 +27,8 @@ import java.util.*;
|
|||
*/
|
||||
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=ReferenceOrderedDatum.class)}) // right now we have no base variant class for rods, this should change
|
||||
@Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=ReferenceOrderedDatum.class), @RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="hapmap-chip",type=ReferenceOrderedDatum.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
|
||||
public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||
//public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||
public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||
@Argument(shortName="minConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false)
|
||||
public int minConfidenceScore = -1;
|
||||
|
||||
|
|
@ -44,7 +45,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
public boolean explode = false;
|
||||
|
||||
@Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false)
|
||||
public boolean includeViolations = false;
|
||||
public boolean mIncludeViolations = false;
|
||||
|
||||
@Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false)
|
||||
public boolean extensiveSubsets = false;
|
||||
|
|
@ -204,6 +205,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
nMappedSites += context.getSkippedBases();
|
||||
|
||||
//System.out.printf("Tracker at %s is %s, ref is %s%n", context.getLocation(), tracker, ref);
|
||||
//if ( ref == null )
|
||||
// out.printf("Last position was %s: skipping %d bases%n",
|
||||
// context.getLocation(), context.getSkippedBases() );
|
||||
if ( ref == null ) { // we are seeing the last site
|
||||
return 0;
|
||||
}
|
||||
|
||||
nMappedSites++;
|
||||
|
||||
int nBoundGoodRods = tracker.getNBoundRodTracks("interval");
|
||||
|
|
@ -244,14 +255,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
public boolean includeViolations() { return mIncludeViolations; }
|
||||
|
||||
public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,
|
||||
RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
// Iterate over each analysis, and update it
|
||||
if (getAnalysisSet(analysisSetName) != null) {
|
||||
for (VariantAnalysis analysis : getAnalysisSet(analysisSetName)) {
|
||||
ArrayList<VariantAnalysis> set = getAnalysisSet(analysisSetName);
|
||||
if ( set != null ) {
|
||||
for ( VariantAnalysis analysis : set ) {
|
||||
String s = analysis.update(eval, tracker, ref, context);
|
||||
if (s != null && includeViolations) {
|
||||
if ( s != null && includeViolations() ) {
|
||||
analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
|
|||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.junit.Test;
|
||||
import org.apache.log4j.Appender;
|
||||
import org.apache.log4j.WriterAppender;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
|
|
@ -25,12 +27,13 @@ public class WalkerTest extends BaseTest {
|
|||
String filemd5sum = bigInt.toString(16);
|
||||
while (filemd5sum.length() < 32) filemd5sum = "0" + filemd5sum; // pad to length 32
|
||||
if ( parameterize() || expectedMD5.equals("") ) {
|
||||
logger.warn(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b",
|
||||
System.out.println(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b",
|
||||
name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5)));
|
||||
} else {
|
||||
logger.warn(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
|
||||
System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
|
||||
System.out.flush();
|
||||
Assert.assertEquals(name + " Mismatching MD5s", expectedMD5, filemd5sum);
|
||||
logger.warn(String.format(" => %s PASSED", name));
|
||||
System.out.println(String.format(" => %s PASSED", name));
|
||||
}
|
||||
|
||||
return filemd5sum;
|
||||
|
|
@ -119,8 +122,8 @@ public class WalkerTest extends BaseTest {
|
|||
}
|
||||
|
||||
final String args = String.format(spec.args, tmpFiles.toArray());
|
||||
logger.warn(Utils.dupString('-', 80));
|
||||
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
|
||||
System.out.println(Utils.dupString('-', 80));
|
||||
System.out.println(String.format("Executing test %s with GATK arguments: %s", name, args));
|
||||
|
||||
CommandLineGATK instance = new CommandLineGATK();
|
||||
CommandLineExecutable.start(instance, args.split(" "));
|
||||
|
|
@ -134,6 +137,6 @@ public class WalkerTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testWalkerTest() {
|
||||
//logger.warn("WalkerTest is just a framework");
|
||||
//System.out.println("WalkerTest is just a framework");
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue