Now does really neightbor distance calculation, as well as true snp cluster counting
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@998 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dbf2cc037c
commit
fb7ba47fff
|
|
@ -0,0 +1,88 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.HashSet;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:19 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ClusterCounterAnalysis extends BasicVariantAnalysis {
|
||||
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
|
||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
|
||||
AllelicVariant lastVariant = null;
|
||||
GenomeLoc lastVariantInterval = null;
|
||||
int nSeen = 0;
|
||||
|
||||
public ClusterCounterAnalysis() {
|
||||
super("cluster_counter_analysis");
|
||||
|
||||
variantsWithClusters = new ArrayList<HashSet<GenomeLoc>>(neighborWiseBoundries.length);
|
||||
for (int i = 0; i < neighborWiseBoundries.length; i++)
|
||||
variantsWithClusters.add(new HashSet<GenomeLoc>());
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null ) {
|
||||
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
|
||||
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||
|
||||
if (lastVariant != null) {
|
||||
GenomeLoc eL = eval.getLocation();
|
||||
GenomeLoc lvL = lastVariant.getLocation();
|
||||
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
||||
long d = eL.distance(lvL);
|
||||
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
|
||||
// we're on different intervals
|
||||
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||
} else {
|
||||
nSeen++;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int i = 0; i < neighborWiseBoundries.length; i++) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
|
||||
if ( d <= maxDist ) {
|
||||
variantsWithClusters.get(i).add(eL);
|
||||
}
|
||||
}
|
||||
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastVariant = eval;
|
||||
lastVariantInterval = interval;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("snps_counted_for_neighbor_distances %d", nSeen));
|
||||
int cumulative = 0;
|
||||
s.add(String.format("description maxDist count cumulative"));
|
||||
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
int count = variantsWithClusters.get(i).size();
|
||||
cumulative += count;
|
||||
s.add(String.format("snps_within_clusters_of_size %10d %10d %10d", maxDist, count, cumulative));
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -18,16 +18,16 @@ import java.io.PrintStream;
|
|||
* Time: 4:38:19 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
||||
ArrayList<Long> pairWiseDistances;
|
||||
int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
||||
public class NeighborDistanceAnalysis extends BasicVariantAnalysis {
|
||||
ArrayList<Long> neighborWiseDistances;
|
||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
||||
|
||||
AllelicVariant lastVariant = null;
|
||||
GenomeLoc lastVariantInterval = null;
|
||||
|
||||
public PairwiseDistanceAnalysis() {
|
||||
super("pairwise_distances");
|
||||
pairWiseDistances = new ArrayList<Long>();
|
||||
public NeighborDistanceAnalysis() {
|
||||
super("neighbor_distances");
|
||||
neighborWiseDistances = new ArrayList<Long>();
|
||||
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
|
|
@ -46,8 +46,8 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
|||
// we're on different intervals
|
||||
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||
} else {
|
||||
pairWiseDistances.add(d);
|
||||
r = String.format("Pairwise-distance %d %s %s", d, eL, lvL);
|
||||
neighborWiseDistances.add(d);
|
||||
r = String.format("neighbor-distance %d %s %s", d, eL, lvL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -60,12 +60,12 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
|||
}
|
||||
|
||||
public List<String> done() {
|
||||
int[] pairCounts = new int[pairWiseBoundries.length];
|
||||
int[] pairCounts = new int[neighborWiseBoundries.length];
|
||||
Arrays.fill(pairCounts, 0);
|
||||
for ( long dist : pairWiseDistances ) {
|
||||
for ( long dist : neighborWiseDistances ) {
|
||||
boolean done = false;
|
||||
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
|
||||
int maxDist = pairWiseBoundries[i];
|
||||
for ( int i = 0; i < neighborWiseBoundries.length && ! done ; i++ ) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
if ( dist <= maxDist ) {
|
||||
pairCounts[i]++;
|
||||
done = true;
|
||||
|
|
@ -74,13 +74,14 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
|||
}
|
||||
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size()));
|
||||
s.add(String.format("snps_counted_for_neighbor_distances %d", neighborWiseDistances.size()));
|
||||
int cumulative = 0;
|
||||
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
|
||||
int maxDist = pairWiseBoundries[i];
|
||||
s.add(String.format("description maxDist count cumulative"));
|
||||
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
int count = pairCounts[i];
|
||||
cumulative += count;
|
||||
s.add(String.format("snps within %8d bp: %d %d", maxDist, count, cumulative));
|
||||
s.add(String.format("snps_immediate_neighbors_within_bp %d %d %d", maxDist, count, cumulative));
|
||||
}
|
||||
|
||||
return s;
|
||||
|
|
@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.refdata.*;
|
|||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -60,9 +59,9 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
analyses.add(new VariantCounter());
|
||||
analyses.add(new VariantDBCoverage("dbSNP"));
|
||||
analyses.add(new TransitionTranversionAnalysis());
|
||||
analyses.add(new PairwiseDistanceAnalysis());
|
||||
analyses.add(new NeighborDistanceAnalysis());
|
||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||
|
||||
analyses.add(new ClusterCounterAnalysis());
|
||||
|
||||
if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
|
||||
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
|
|||
Collections.sort(locs);
|
||||
//logger.info(String.format("Going to process %d locations", locs.length));
|
||||
locs = mergeOverlappingLocations(locs);
|
||||
logger.info("Locations are:" + Utils.join(", ", locs));
|
||||
logger.debug("Locations are:" + Utils.join(", ", locs));
|
||||
return locs;
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
|
|
|
|||
|
|
@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
|
|||
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False ):
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ):
|
||||
if useSamtools:
|
||||
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
|
||||
else:
|
||||
|
|
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
|
|||
MSDStr = ''
|
||||
if MSD: MSDStr = 'MSD=true'
|
||||
|
||||
return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
|
||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||
|
|
|
|||
Loading…
Reference in New Issue