From fb7ba47fffff0699118b8f74fea507eb7ff9acdc Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 12 Jun 2009 16:29:26 +0000 Subject: [PATCH] Now does really neightbor distance calculation, as well as true snp cluster counting git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@998 348d0f76-0448-11de-a6fe-93d51630548a --- .../varianteval/ClusterCounterAnalysis.java | 88 +++++++++++++++++++ ...sis.java => NeighborDistanceAnalysis.java} | 33 +++---- .../varianteval/VariantEvalWalker.java | 5 +- .../broadinstitute/sting/utils/GenomeLoc.java | 2 +- python/picard_utils.py | 4 +- 5 files changed, 110 insertions(+), 22 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java rename java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/{PairwiseDistanceAnalysis.java => NeighborDistanceAnalysis.java} (67%) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java new file mode 100755 index 000000000..a1b904cbf --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.IntervalRod; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.HashSet; +import java.io.PrintStream; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jun 4, 2009 + * Time: 4:38:19 PM + * To change this template use File | Settings | File Templates. + */ +public class ClusterCounterAnalysis extends BasicVariantAnalysis { + ArrayList> variantsWithClusters; + int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100}; + AllelicVariant lastVariant = null; + GenomeLoc lastVariantInterval = null; + int nSeen = 0; + + public ClusterCounterAnalysis() { + super("cluster_counter_analysis"); + + variantsWithClusters = new ArrayList>(neighborWiseBoundries.length); + for (int i = 0; i < neighborWiseBoundries.length; i++) + variantsWithClusters.add(new HashSet()); + } + + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + String r = null; + + if ( eval != null ) { + IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null); + GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation(); + + if (lastVariant != null) { + GenomeLoc eL = eval.getLocation(); + GenomeLoc lvL = lastVariant.getLocation(); + if (eL.getContigIndex() == lvL.getContigIndex()) { + long d = eL.distance(lvL); + if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) { + // we're on different intervals + //out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval); + } else { + nSeen++; + StringBuilder s = new StringBuilder(); + for (int i = 0; i < neighborWiseBoundries.length; i++) { + int maxDist = neighborWiseBoundries[i]; + s.append(String.format("%d ", d <= maxDist ? maxDist : 0)); + if ( d <= maxDist ) { + variantsWithClusters.get(i).add(eL); + } + } + r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString()); + } + } + } + + lastVariant = eval; + lastVariantInterval = interval; + } + + return r; + } + + public List done() { + List s = new ArrayList(); + s.add(String.format("snps_counted_for_neighbor_distances %d", nSeen)); + int cumulative = 0; + s.add(String.format("description maxDist count cumulative")); + for ( int i = 0; i < neighborWiseBoundries.length; i++ ) { + int maxDist = neighborWiseBoundries[i]; + int count = variantsWithClusters.get(i).size(); + cumulative += count; + s.add(String.format("snps_within_clusters_of_size %10d %10d %10d", maxDist, count, cumulative)); + } + + return s; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PairwiseDistanceAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java similarity index 67% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PairwiseDistanceAnalysis.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java index 0090acb39..ea524a32a 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PairwiseDistanceAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java @@ -18,16 +18,16 @@ import java.io.PrintStream; * Time: 4:38:19 PM * To change this template use File | Settings | File Templates. */ -public class PairwiseDistanceAnalysis extends BasicVariantAnalysis { - ArrayList pairWiseDistances; - int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000}; +public class NeighborDistanceAnalysis extends BasicVariantAnalysis { + ArrayList neighborWiseDistances; + int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000}; AllelicVariant lastVariant = null; GenomeLoc lastVariantInterval = null; - public PairwiseDistanceAnalysis() { - super("pairwise_distances"); - pairWiseDistances = new ArrayList(); + public NeighborDistanceAnalysis() { + super("neighbor_distances"); + neighborWiseDistances = new ArrayList(); } public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { @@ -46,8 +46,8 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis { // we're on different intervals //out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval); } else { - pairWiseDistances.add(d); - r = String.format("Pairwise-distance %d %s %s", d, eL, lvL); + neighborWiseDistances.add(d); + r = String.format("neighbor-distance %d %s %s", d, eL, lvL); } } } @@ -60,12 +60,12 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis { } public List done() { - int[] pairCounts = new int[pairWiseBoundries.length]; + int[] pairCounts = new int[neighborWiseBoundries.length]; Arrays.fill(pairCounts, 0); - for ( long dist : pairWiseDistances ) { + for ( long dist : neighborWiseDistances ) { boolean done = false; - for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) { - int maxDist = pairWiseBoundries[i]; + for ( int i = 0; i < neighborWiseBoundries.length && ! done ; i++ ) { + int maxDist = neighborWiseBoundries[i]; if ( dist <= maxDist ) { pairCounts[i]++; done = true; @@ -74,13 +74,14 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis { } List s = new ArrayList(); - s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size())); + s.add(String.format("snps_counted_for_neighbor_distances %d", neighborWiseDistances.size())); int cumulative = 0; - for ( int i = 0; i < pairWiseBoundries.length; i++ ) { - int maxDist = pairWiseBoundries[i]; + s.add(String.format("description maxDist count cumulative")); + for ( int i = 0; i < neighborWiseBoundries.length; i++ ) { + int maxDist = neighborWiseBoundries[i]; int count = pairCounts[i]; cumulative += count; - s.add(String.format("snps within %8d bp: %d %d", maxDist, count, cumulative)); + s.add(String.format("snps_immediate_neighbors_within_bp %d %d %d", maxDist, count, cumulative)); } return s; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index db2aaa051..74ef78b0d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.*; @@ -60,9 +59,9 @@ public class VariantEvalWalker extends RefWalker { analyses.add(new VariantCounter()); analyses.add(new VariantDBCoverage("dbSNP")); analyses.add(new TransitionTranversionAnalysis()); - analyses.add(new PairwiseDistanceAnalysis()); + analyses.add(new NeighborDistanceAnalysis()); analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold)); - + analyses.add(new ClusterCounterAnalysis()); if ( printVariants ) analyses.add(new VariantMatcher("dbSNP")); diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b75def849..323b16ec6 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -227,7 +227,7 @@ public class GenomeLoc implements Comparable, Cloneable { Collections.sort(locs); //logger.info(String.format("Going to process %d locations", locs.length)); locs = mergeOverlappingLocations(locs); - logger.info("Locations are:" + Utils.join(", ", locs)); + logger.debug("Locations are:" + Utils.join(", ", locs)); return locs; } catch (Exception e) { e.printStackTrace(); diff --git a/python/picard_utils.py b/python/picard_utils.py index d8df8f4b9..3377b782f 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ): #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] -def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False ): +def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ): if useSamtools: return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles) else: @@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, MSDStr = '' if MSD: MSDStr = 'MSD=true' - return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) + return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) def getPicardPath(lane, picardRoot = '/seq/picard/'):