Now does really neightbor distance calculation, as well as true snp cluster counting
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@998 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dbf2cc037c
commit
fb7ba47fff
|
|
@ -0,0 +1,88 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: depristo
|
||||||
|
* Date: Jun 4, 2009
|
||||||
|
* Time: 4:38:19 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class ClusterCounterAnalysis extends BasicVariantAnalysis {
|
||||||
|
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
|
||||||
|
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
|
||||||
|
AllelicVariant lastVariant = null;
|
||||||
|
GenomeLoc lastVariantInterval = null;
|
||||||
|
int nSeen = 0;
|
||||||
|
|
||||||
|
public ClusterCounterAnalysis() {
|
||||||
|
super("cluster_counter_analysis");
|
||||||
|
|
||||||
|
variantsWithClusters = new ArrayList<HashSet<GenomeLoc>>(neighborWiseBoundries.length);
|
||||||
|
for (int i = 0; i < neighborWiseBoundries.length; i++)
|
||||||
|
variantsWithClusters.add(new HashSet<GenomeLoc>());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
String r = null;
|
||||||
|
|
||||||
|
if ( eval != null ) {
|
||||||
|
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
|
||||||
|
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||||
|
|
||||||
|
if (lastVariant != null) {
|
||||||
|
GenomeLoc eL = eval.getLocation();
|
||||||
|
GenomeLoc lvL = lastVariant.getLocation();
|
||||||
|
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
||||||
|
long d = eL.distance(lvL);
|
||||||
|
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
|
||||||
|
// we're on different intervals
|
||||||
|
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||||
|
} else {
|
||||||
|
nSeen++;
|
||||||
|
StringBuilder s = new StringBuilder();
|
||||||
|
for (int i = 0; i < neighborWiseBoundries.length; i++) {
|
||||||
|
int maxDist = neighborWiseBoundries[i];
|
||||||
|
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
|
||||||
|
if ( d <= maxDist ) {
|
||||||
|
variantsWithClusters.get(i).add(eL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lastVariant = eval;
|
||||||
|
lastVariantInterval = interval;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> done() {
|
||||||
|
List<String> s = new ArrayList<String>();
|
||||||
|
s.add(String.format("snps_counted_for_neighbor_distances %d", nSeen));
|
||||||
|
int cumulative = 0;
|
||||||
|
s.add(String.format("description maxDist count cumulative"));
|
||||||
|
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||||
|
int maxDist = neighborWiseBoundries[i];
|
||||||
|
int count = variantsWithClusters.get(i).size();
|
||||||
|
cumulative += count;
|
||||||
|
s.add(String.format("snps_within_clusters_of_size %10d %10d %10d", maxDist, count, cumulative));
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -18,16 +18,16 @@ import java.io.PrintStream;
|
||||||
* Time: 4:38:19 PM
|
* Time: 4:38:19 PM
|
||||||
* To change this template use File | Settings | File Templates.
|
* To change this template use File | Settings | File Templates.
|
||||||
*/
|
*/
|
||||||
public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
public class NeighborDistanceAnalysis extends BasicVariantAnalysis {
|
||||||
ArrayList<Long> pairWiseDistances;
|
ArrayList<Long> neighborWiseDistances;
|
||||||
int[] pairWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
||||||
|
|
||||||
AllelicVariant lastVariant = null;
|
AllelicVariant lastVariant = null;
|
||||||
GenomeLoc lastVariantInterval = null;
|
GenomeLoc lastVariantInterval = null;
|
||||||
|
|
||||||
public PairwiseDistanceAnalysis() {
|
public NeighborDistanceAnalysis() {
|
||||||
super("pairwise_distances");
|
super("neighbor_distances");
|
||||||
pairWiseDistances = new ArrayList<Long>();
|
neighborWiseDistances = new ArrayList<Long>();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
|
@ -46,8 +46,8 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
||||||
// we're on different intervals
|
// we're on different intervals
|
||||||
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||||
} else {
|
} else {
|
||||||
pairWiseDistances.add(d);
|
neighborWiseDistances.add(d);
|
||||||
r = String.format("Pairwise-distance %d %s %s", d, eL, lvL);
|
r = String.format("neighbor-distance %d %s %s", d, eL, lvL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -60,12 +60,12 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> done() {
|
public List<String> done() {
|
||||||
int[] pairCounts = new int[pairWiseBoundries.length];
|
int[] pairCounts = new int[neighborWiseBoundries.length];
|
||||||
Arrays.fill(pairCounts, 0);
|
Arrays.fill(pairCounts, 0);
|
||||||
for ( long dist : pairWiseDistances ) {
|
for ( long dist : neighborWiseDistances ) {
|
||||||
boolean done = false;
|
boolean done = false;
|
||||||
for ( int i = 0; i < pairWiseBoundries.length && ! done ; i++ ) {
|
for ( int i = 0; i < neighborWiseBoundries.length && ! done ; i++ ) {
|
||||||
int maxDist = pairWiseBoundries[i];
|
int maxDist = neighborWiseBoundries[i];
|
||||||
if ( dist <= maxDist ) {
|
if ( dist <= maxDist ) {
|
||||||
pairCounts[i]++;
|
pairCounts[i]++;
|
||||||
done = true;
|
done = true;
|
||||||
|
|
@ -74,13 +74,14 @@ public class PairwiseDistanceAnalysis extends BasicVariantAnalysis {
|
||||||
}
|
}
|
||||||
|
|
||||||
List<String> s = new ArrayList<String>();
|
List<String> s = new ArrayList<String>();
|
||||||
s.add(String.format("snps counted for pairwise distance: %d", pairWiseDistances.size()));
|
s.add(String.format("snps_counted_for_neighbor_distances %d", neighborWiseDistances.size()));
|
||||||
int cumulative = 0;
|
int cumulative = 0;
|
||||||
for ( int i = 0; i < pairWiseBoundries.length; i++ ) {
|
s.add(String.format("description maxDist count cumulative"));
|
||||||
int maxDist = pairWiseBoundries[i];
|
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||||
|
int maxDist = neighborWiseBoundries[i];
|
||||||
int count = pairCounts[i];
|
int count = pairCounts[i];
|
||||||
cumulative += count;
|
cumulative += count;
|
||||||
s.add(String.format("snps within %8d bp: %d %d", maxDist, count, cumulative));
|
s.add(String.format("snps_immediate_neighbors_within_bp %d %d %d", maxDist, count, cumulative));
|
||||||
}
|
}
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
|
|
@ -5,7 +5,6 @@ import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.Pair;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -60,9 +59,9 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||||
analyses.add(new VariantCounter());
|
analyses.add(new VariantCounter());
|
||||||
analyses.add(new VariantDBCoverage("dbSNP"));
|
analyses.add(new VariantDBCoverage("dbSNP"));
|
||||||
analyses.add(new TransitionTranversionAnalysis());
|
analyses.add(new TransitionTranversionAnalysis());
|
||||||
analyses.add(new PairwiseDistanceAnalysis());
|
analyses.add(new NeighborDistanceAnalysis());
|
||||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||||
|
analyses.add(new ClusterCounterAnalysis());
|
||||||
|
|
||||||
if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
|
if ( printVariants ) analyses.add(new VariantMatcher("dbSNP"));
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -227,7 +227,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
|
||||||
Collections.sort(locs);
|
Collections.sort(locs);
|
||||||
//logger.info(String.format("Going to process %d locations", locs.length));
|
//logger.info(String.format("Going to process %d locations", locs.length));
|
||||||
locs = mergeOverlappingLocations(locs);
|
locs = mergeOverlappingLocations(locs);
|
||||||
logger.info("Locations are:" + Utils.join(", ", locs));
|
logger.debug("Locations are:" + Utils.join(", ", locs));
|
||||||
return locs;
|
return locs;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
|
|
||||||
|
|
@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
|
||||||
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||||
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||||
|
|
||||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False ):
|
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ):
|
||||||
if useSamtools:
|
if useSamtools:
|
||||||
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
|
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
|
||||||
else:
|
else:
|
||||||
|
|
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
|
||||||
MSDStr = ''
|
MSDStr = ''
|
||||||
if MSD: MSDStr = 'MSD=true'
|
if MSD: MSDStr = 'MSD=true'
|
||||||
|
|
||||||
return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||||
|
|
||||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue