diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 7c0ef7980..99568e3ca 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -543,7 +543,7 @@ public class GenomeAnalysisEngine { ShardStrategyFactory.SHATTER_STRATEGY shardType; if (walker instanceof LocusWalker) { - if (walker instanceof RodWalker) SHARD_SIZE *= 100; + if (walker instanceof RodWalker) SHARD_SIZE *= 1000; if (intervals != null) { shardType = (walker.isReduceByInterval()) ? diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index 589a72eec..e5a8831b4 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -27,11 +27,6 @@ import net.sf.samtools.SAMRecord; * A view into the reference-ordered data in the provider. */ public class RodLocusView extends LocusView implements ReferenceOrderedView { - /** - * The provider that's supplying our backing data. - */ - //private final ShardDataProvider provider; - /** * The data sources along with their current states. */ @@ -64,14 +59,19 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { List< Iterator> > iterators = new LinkedList< Iterator> >(); for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) { if ( DEBUG ) System.out.printf("Shard is %s%n", loc); + + // grab the ROD iterator from the data source, and compute the first location in this shard, forwarding + // the iterator to immediately before it, so that it can be added to the merging iterator primed for + // next() to return the first real ROD in this shard SeekableRODIterator it = (SeekableRODIterator)dataSource.seek(provider.getShard()); - RODRecordList x = it.seekForward(loc); + GenomeLoc shardLoc = provider.getShard().getGenomeLoc(); + it.seekForward(GenomeLocParser.createGenomeLoc(shardLoc.getContigIndex(), shardLoc.getStart()-1, shardLoc.getStart()-1)); // we need to special case the interval so we don't always think there's a rod at the first location if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) { if ( interval != null ) throw new RuntimeException("BUG: interval local variable already assigned " + interval); - interval = x; + interval = (RODRecordList)it.next(); } else { iterators.add( it ); } @@ -142,30 +142,16 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { return rodQueue.allElementsLTE(marker); } -// private Collection getSpanningRods(ReferenceOrderedDatum marker) { -// Collection allFromQueue = rodQueue.allElementsLTE(marker); -// Collection all = new LinkedList(allFromQueue); -// -// Iterator it = multiLocusRODs.iterator(); -// while ( it.hasNext() ) { -// ReferenceOrderedDatum mlr = it.next(); -// if ( mlr.getLocation().overlapsP(marker.getLocation()) ) { -// all.add(mlr); -// } else { -// it.remove(); // no long covering this site -// } -// } -// -// for ( ReferenceOrderedDatum datum : allFromQueue ) { -// if ( ! datum.getLocation().isSingleBP() ) { -// System.out.printf("Adding multi-locus %s%n", datum); -// multiLocusRODs.add(datum); -// } -// } -// -// return all; -// } - + /** + * Returns the number of reference bases that have been skipped: + * + * 1 -- since the last processed location if we have one + * 2 -- from the beginning of the shard if this is the first loc + * 3 -- from the last location to the current position + * + * @param currentPos + * @return + */ private long getSkippedBases( GenomeLoc currentPos ) { long skippedBases = 0; @@ -179,7 +165,8 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { } if ( skippedBases < -1 ) { // minus 1 value is ok - throw new RuntimeException(String.format("BUG: skipped bases is < 0: cur=%s vs. last=%s", currentPos, lastLoc)); + throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. last=%s, shard=%s", + skippedBases, currentPos, lastLoc, shard.getGenomeLoc())); } return Math.max(skippedBases, 0); } @@ -215,8 +202,6 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { * Closes the current view. */ public void close() { - //rodQueue.close(); - rodQueue = null; tracker = null; } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java index 646f0aebd..01614cbcb 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java @@ -49,9 +49,8 @@ public class TraverseLoci extends TraversalEngine { LocusView locusView = getLocusView( walker, dataProvider ); - - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) - throw new RuntimeException("Engine currently doesn't support RodWalkers"); + //if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) + // throw new RuntimeException("Engine currently doesn't support RodWalkers"); if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java index bf965a813..d7c717072 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java @@ -21,19 +21,29 @@ import java.util.List; * */ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis { - ArrayList> variantsWithClusters; int minDistanceForFlagging = 5; + + /** + * Threshold distances between neighboring SNPs we will track and assess + */ int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100}; + int[] variantsWithClusters = new int[neighborWiseBoundries.length]; + /** + * Keep track of the last variation we saw in the stream + */ Variation lastVariation = null; + + /** + * The interval that the last variation occurred in. Don't think that SNPs from different intervals are + * too close together in hybrid selection. + */ GenomeLoc lastVariantInterval = null; int nSeen = 0; + + public ClusterCounterAnalysis() { super("cluster_counter_analysis"); - - variantsWithClusters = new ArrayList>(neighborWiseBoundries.length); - for (int i = 0; i < neighborWiseBoundries.length; i++) - variantsWithClusters.add(new HashSet()); } public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) { @@ -46,26 +56,27 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno if (lastVariation != null) { GenomeLoc eL = eval.getLocation(); GenomeLoc lvL = lastVariation.getLocation(); - if (eL.getContigIndex() == lvL.getContigIndex()) { + + // if we are on the same contig, and we in the same interval + if (eL.getContigIndex() == lvL.getContigIndex() && ! (lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0)) { long d = eL.distance(lvL); - if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) { - // we're on different intervals - //out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval); - } else { - nSeen++; - StringBuilder s = new StringBuilder(); - for (int i = 0; i < neighborWiseBoundries.length; i++) { - int maxDist = neighborWiseBoundries[i]; - s.append(String.format("%d ", d <= maxDist ? maxDist : 0)); - if ( d <= maxDist ) { - variantsWithClusters.get(i).add(eL); - } + nSeen++; + StringBuilder s = new StringBuilder(); + for (int i = 0; i < neighborWiseBoundries.length; i++) { + int maxDist = neighborWiseBoundries[i]; + s.append(String.format("%d ", d <= maxDist ? maxDist : 0)); + if ( d <= maxDist ) { + variantsWithClusters[i]++; } - r = d <= minDistanceForFlagging ? String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString()) : null; } + + // lookup in master for performance reasons + if ( d <= minDistanceForFlagging && getMaster().includeViolations() ) + r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString()); } } + // eval is now the last variation lastVariation = eval; lastVariantInterval = interval; } @@ -79,7 +90,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno s.add(String.format("description maxDist count")); for ( int i = 0; i < neighborWiseBoundries.length; i++ ) { int maxDist = neighborWiseBoundries[i]; - int count = variantsWithClusters.get(i).size(); + int count = variantsWithClusters[i]; s.add(String.format("snps_within_clusters_of_size %10d %10d", maxDist, count)); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index 4d7a3508f..ae96d3f08 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -27,7 +27,8 @@ import java.util.*; */ @Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=ReferenceOrderedDatum.class)}) // right now we have no base variant class for rods, this should change @Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=ReferenceOrderedDatum.class), @RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="hapmap-chip",type=ReferenceOrderedDatum.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)}) -public class VariantEvalWalker extends RefWalker { +//public class VariantEvalWalker extends RefWalker { +public class VariantEvalWalker extends RodWalker { @Argument(shortName="minConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false) public int minConfidenceScore = -1; @@ -44,7 +45,7 @@ public class VariantEvalWalker extends RefWalker { public boolean explode = false; @Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false) - public boolean includeViolations = false; + public boolean mIncludeViolations = false; @Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false) public boolean extensiveSubsets = false; @@ -204,6 +205,16 @@ public class VariantEvalWalker extends RefWalker { } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nMappedSites += context.getSkippedBases(); + + //System.out.printf("Tracker at %s is %s, ref is %s%n", context.getLocation(), tracker, ref); + //if ( ref == null ) + // out.printf("Last position was %s: skipping %d bases%n", + // context.getLocation(), context.getSkippedBases() ); + if ( ref == null ) { // we are seeing the last site + return 0; + } + nMappedSites++; int nBoundGoodRods = tracker.getNBoundRodTracks("interval"); @@ -244,14 +255,16 @@ public class VariantEvalWalker extends RefWalker { return 1; } + public boolean includeViolations() { return mIncludeViolations; } public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) { // Iterate over each analysis, and update it - if (getAnalysisSet(analysisSetName) != null) { - for (VariantAnalysis analysis : getAnalysisSet(analysisSetName)) { + ArrayList set = getAnalysisSet(analysisSetName); + if ( set != null ) { + for ( VariantAnalysis analysis : set ) { String s = analysis.update(eval, tracker, ref, context); - if (s != null && includeViolations) { + if ( s != null && includeViolations() ) { analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s); } } diff --git a/java/test/org/broadinstitute/sting/WalkerTest.java b/java/test/org/broadinstitute/sting/WalkerTest.java index ed1355a60..e508e4953 100755 --- a/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/java/test/org/broadinstitute/sting/WalkerTest.java @@ -6,6 +6,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.Utils; import org.junit.Test; +import org.apache.log4j.Appender; +import org.apache.log4j.WriterAppender; import java.io.File; import java.io.FileInputStream; @@ -25,12 +27,13 @@ public class WalkerTest extends BaseTest { String filemd5sum = bigInt.toString(16); while (filemd5sum.length() < 32) filemd5sum = "0" + filemd5sum; // pad to length 32 if ( parameterize() || expectedMD5.equals("") ) { - logger.warn(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b", + System.out.println(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b", name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5))); } else { - logger.warn(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); + System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); + System.out.flush(); Assert.assertEquals(name + " Mismatching MD5s", expectedMD5, filemd5sum); - logger.warn(String.format(" => %s PASSED", name)); + System.out.println(String.format(" => %s PASSED", name)); } return filemd5sum; @@ -119,8 +122,8 @@ public class WalkerTest extends BaseTest { } final String args = String.format(spec.args, tmpFiles.toArray()); - logger.warn(Utils.dupString('-', 80)); - logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args)); + System.out.println(Utils.dupString('-', 80)); + System.out.println(String.format("Executing test %s with GATK arguments: %s", name, args)); CommandLineGATK instance = new CommandLineGATK(); CommandLineExecutable.start(instance, args.split(" ")); @@ -134,6 +137,6 @@ public class WalkerTest extends BaseTest { @Test public void testWalkerTest() { - //logger.warn("WalkerTest is just a framework"); + //System.out.println("WalkerTest is just a framework"); } } \ No newline at end of file