Now does really neightbor distance calculation, as well as true snp cluster counting

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@998 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-12 16:29:26 +00:00
parent dbf2cc037c
commit fb7ba47fff
5 changed files with 110 additions and 22 deletions

View File

@ -0,0 +1,88 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.HashSet;
import java.io.PrintStream;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:19 PM
* To change this template use File | Settings | File Templates.
*/
public class ClusterCounterAnalysis extends BasicVariantAnalysis {
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
AllelicVariant lastVariant = null;
GenomeLoc lastVariantInterval = null;
int nSeen = 0;
public ClusterCounterAnalysis() {
super("cluster_counter_analysis");
variantsWithClusters = new ArrayList<HashSet<GenomeLoc>>(neighborWiseBoundries.length);
for (int i = 0; i < neighborWiseBoundries.length; i++)
variantsWithClusters.add(new HashSet<GenomeLoc>());
}
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
String r = null;
if ( eval != null ) {
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
if (lastVariant != null) {
GenomeLoc eL = eval.getLocation();
GenomeLoc lvL = lastVariant.getLocation();
if (eL.getContigIndex() == lvL.getContigIndex()) {
long d = eL.distance(lvL);
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
// we're on different intervals
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
} else {
nSeen++;
StringBuilder s = new StringBuilder();
for (int i = 0; i < neighborWiseBoundries.length; i++) {
int maxDist = neighborWiseBoundries[i];
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
if ( d <= maxDist ) {
variantsWithClusters.get(i).add(eL);
}
}
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
}
}
}
lastVariant = eval;
lastVariantInterval = interval;
}
return r;
}
public List<String> done() {
List<String> s = new ArrayList<String>();
s.add(String.format("snps_counted_for_neighbor_distances %d", nSeen));
int cumulative = 0;
s.add(String.format("description maxDist count cumulative"));
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
int maxDist = neighborWiseBoundries[i];
int count = variantsWithClusters.get(i).size();
cumulative += count;
s.add(String.format("snps_within_clusters_of_size %10d %10d %10d", maxDist, count, cumulative));
}
return s;
}
}

View File

@ -18,16 +18,16 @@ import java.io.PrintStream;
* Time: 4:38:19 PM
* To change this template use File | Settings | File Templates.
*/
public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
ArrayList<Long> pairWiseDistances;
int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
public class NeighborDistanceAnalysis extends BasicVariantAnalysis {
ArrayList<Long> neighborWiseDistances;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
AllelicVariant lastVariant = null;
GenomeLoc lastVariantInterval = null;
public PairwiseDistanceAnalysis() {
super("pairwise_distances");
pairWiseDistances = new ArrayList<Long>();
public NeighborDistanceAnalysis() {
super("neighbor_distances");
neighborWiseDistances = new ArrayList<Long>();
}
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
@ -46,8 +46,8 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
// we're on different intervals
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
} else {
pairWiseDistances.add(d);
r = String.format("Pairwise-distance %d %s %s", d, eL, lvL);
neighborWiseDistances.add(d);
r = String.format("neighbor-distance %d %s %s", d, eL, lvL);
}
}
}
@ -60,12 +60,12 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
}
public List<String> done() {
int[] pairCounts = new int[pairWiseBoundries.length];
int[] pairCounts = new int[neighborWiseBoundries.length];
Arrays.fill(pairCounts, 0);
for ( long dist : pairWiseDistances ) {
for ( long dist : neighborWiseDistances ) {
boolean done = false;
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
int maxDist = pairWiseBoundries[i];
for ( int i = 0; i < neighborWiseBoundries.length && ! done ; i++ ) {
int maxDist = neighborWiseBoundries[i];
if ( dist <= maxDist ) {
pairCounts[i]++;
done = true;
@ -74,13 +74,14 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
}
List<String> s = new ArrayList<String>();
s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size()));
s.add(String.format("snps_counted_for_neighbor_distances %d", neighborWiseDistances.size()));
int cumulative = 0;
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
int maxDist = pairWiseBoundries[i];
s.add(String.format("description maxDist count cumulative"));
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
int maxDist = neighborWiseBoundries[i];
int count = pairCounts[i];
cumulative += count;
s.add(String.format("snps within %8d bp: %d %d", maxDist, count, cumulative));
s.add(String.format("snps_immediate_neighbors_within_bp %d %d %d", maxDist, count, cumulative));
}
return s;

View File

@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
@ -60,9 +59,9 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
analyses.add(new VariantCounter());
analyses.add(new VariantDBCoverage("dbSNP"));
analyses.add(new TransitionTranversionAnalysis());
analyses.add(new PairwiseDistanceAnalysis());
analyses.add(new NeighborDistanceAnalysis());
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
analyses.add(new ClusterCounterAnalysis());
if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));

View File

@ -227,7 +227,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
Collections.sort(locs);
//logger.info(String.format("Going to process %d locations", locs.length));
locs = mergeOverlappingLocations(locs);
logger.info("Locations are:" + Utils.join(", ", locs));
logger.debug("Locations are:" + Utils.join(", ", locs));
return locs;
} catch (Exception e) {
e.printStackTrace();

View File

@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False ):
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ):
if useSamtools:
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
else:
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
MSDStr = ''
if MSD: MSDStr = 'MSD=true'
return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
def getPicardPath(lane, picardRoot = '/seq/picard/'):