Commit for Aaron

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2932 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-03-04 21:29:07 +00:00
parent c20d3e567e
commit 706d49d84c
2 changed files with 107 additions and 19 deletions

View File

@ -4,6 +4,11 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
@ -48,9 +53,9 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
int start = 1;
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
int stop = 1000;
int stop = 500;
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
int nBins = 20;
int nBins = 499;
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
byte minMappingQuality = 50;
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
@ -73,6 +78,8 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
boolean includeDeletions = false;
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
boolean ignoreDeletionSites = false;
@Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
File refSeqGeneList = null;
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
@ -98,17 +105,21 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
if ( ! omitDepthOutput ) { // print header
out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth");
//System.out.printf("\t[log]\t%s\t%s\t%s","Locus","Total_Depth","Average_Depth");
// get all the samples
HashSet<String> allSamples = getSamplesFromToolKit(useReadGroup);
for ( String s : allSamples) {
out.printf("\t%s_%s","Depth_for",s);
//System.out.printf("\t%s_%s","Depth_for",s);
if ( printBaseCounts ) {
out.printf("\t%s_%s",s,"_base_counts");
out.printf("\t%s_%s",s,"base_counts");
//System.out.printf("\t%s_%s",s,"base_counts");
}
}
out.printf("%n");
//System.out.printf("%n");
} else {
out.printf("Per-Locus Depth of Coverage output was omitted");
@ -162,6 +173,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
if ( ! omitDepthOutput ) {
out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
//System.out.printf("\t[log]\t%s",ref.getLocus());
}
Map<String,int[]> countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality,
@ -191,6 +203,9 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
////////////////////////////////////////////////////////////////////////////////////
public void onTraversalDone( List<Pair<GenomeLoc,DepthOfCoverageStats>> statsByInterval ) {
if ( refSeqGeneList != null ) {
printGeneStats(statsByInterval);
}
File intervalStatisticsFile = deriveFromStream("interval_statistics");
File intervalSummaryFile = deriveFromStream("interval_summary");
DepthOfCoverageStats mergedStats = printIntervalStatsAndMerge(statsByInterval,intervalSummaryFile, intervalStatisticsFile);
@ -263,7 +278,57 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
return firstStats;
}
private void printTargetSummary(PrintStream output, Pair<GenomeLoc,DepthOfCoverageStats> intervalStats) {
/**
 * Aggregates per-target coverage statistics by the RefSeq gene each target
 * overlaps, then prints one summary line per gene to the "gene_summary" stream.
 *
 * @param statsByTarget per-interval coverage stats, in traversal order
 */
private void printGeneStats(List<Pair<GenomeLoc,DepthOfCoverageStats>> statsByTarget) {
    LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq();
    List<Pair<String,DepthOfCoverageStats>> statsByGene = new ArrayList<Pair<String,DepthOfCoverageStats>>(); // maintains order
    Map<String,DepthOfCoverageStats> geneNamesToStats = new HashMap<String,DepthOfCoverageStats>(); // allows indirect updating of objects in list
    for ( Pair<GenomeLoc,DepthOfCoverageStats> targetStats : statsByTarget ) {
        String gene = getGeneName(targetStats.first,refseqIterator);
        // containsKey replaces keySet().contains — same semantics, clearer idiom
        if ( geneNamesToStats.containsKey(gene) ) {
            // already saw this gene: fold the new interval's stats into its entry
            geneNamesToStats.get(gene).merge(targetStats.second);
        } else {
            geneNamesToStats.put(gene,targetStats.second);
            statsByGene.add(new Pair<String,DepthOfCoverageStats>(gene,targetStats.second));
        }
    }
    PrintStream geneSummaryOut = getCorrectStream(out,deriveFromStream("gene_summary"));
    for ( Pair<String,DepthOfCoverageStats> geneStats : statsByGene ) {
        printTargetSummary(geneSummaryOut,geneStats);
    }
    // Only close when writing to a real file; closing stdout would break later output.
    if ( ! getToolkit().getArguments().outFileName.contains("stdout")) {
        geneSummaryOut.close();
    }
}
// blatantly stolen from Andrew Kernytsky
/**
 * Resolves the gene name for a target locus from the RefSeq track.
 * Returns "UNKNOWN" when no iterator is available, no annotation covers the
 * locus, or no overlapping record intersects an exon of the target.
 */
private String getGeneName(GenomeLoc target, LocationAwareSeekableRODIterator refseqIterator) {
    if ( refseqIterator == null ) {
        return "UNKNOWN";
    }
    RODRecordList annotations = refseqIterator.seekForward(target);
    if ( annotations == null ) {
        return "UNKNOWN";
    }
    for ( ReferenceOrderedDatum datum : annotations ) {
        rodRefSeq refSeqRecord = (rodRefSeq) datum; // cast once, reuse below
        if ( refSeqRecord.overlapsExonP(target) ) {
            return refSeqRecord.getGeneName();
        }
    }
    return "UNKNOWN";
}
/**
 * Binds the user-supplied RefSeq file as a "refseq" reference-ordered data
 * track and returns a seekable iterator over its records.
 */
private LocationAwareSeekableRODIterator initializeRefSeq() {
    ReferenceOrderedData<rodRefSeq> refSeqTrack =
            new ReferenceOrderedData<rodRefSeq>("refseq", refSeqGeneList, rodRefSeq.class);
    return refSeqTrack.iterator();
}
private void printTargetSummary(PrintStream output, Pair<?,DepthOfCoverageStats> intervalStats) {
DepthOfCoverageStats stats = intervalStats.second;
int[] bins = stats.getEndpoints();
StringBuilder targetSummary = new StringBuilder();
@ -452,10 +517,11 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
private void printSummary(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
PrintStream output = getCorrectStream(out,optionalFile);
output.printf("%s\t%s\t%s\t%s\t%s%n","sample_id","mean","granular_third_quartile","granular_median","granular_first_quartile");
output.printf("%s\t%s\t%s\t%s\t%s\t%s%n","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile");
Map<String,int[]> histograms = stats.getHistograms();
Map<String,Double> means = stats.getMeans();
Map<String,Long> totals = stats.getTotals();
int[] leftEnds = stats.getEndpoints();
for ( String s : histograms.keySet() ) {
@ -467,10 +533,10 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
median = median == histogram.length-1 ? histogram.length-2 : median;
q1 = q1 == histogram.length-1 ? histogram.length-2 : q1;
q3 = q3 == histogram.length-1 ? histogram.length-2 : q3;
output.printf("%s\t%.2f\t%d\t%d\t%d%n",s,means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]);
output.printf("%s\t%d\t%.2f\t%d\t%d\t%d%n",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]);
}
output.printf("%s\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalMeanCoverage(),"N/A","N/A","N/A");
output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A");
}
private int getQuantile(int[] histogram, double prop) {
@ -507,6 +573,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
}
// remember -- genome locus was printed in map()
stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
//System.out.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
}

View File

@ -4,7 +4,9 @@ import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
@ -14,11 +16,26 @@ import java.util.Arrays;
*/
public class CoverageStatisticsIntegrationTest extends WalkerTest {
private boolean RUN_TESTS = true;
private boolean RUN_TESTS = false;
private String root = "-T CoverageStatistics ";
private String hg18 = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
private String b36 = "/broad/1KG/reference/human_b36_both.fasta";
private String buildRootCmd(String ref, String bam, String interval) {
return root + "-R "+ref+" -I "+bam+" -L "+interval;
/**
 * Builds the base command line: walker root + reference + one "-I" flag per
 * bam file + one "-L" flag per interval.
 *
 * Iterates instead of remove(0)-draining so the caller's lists are left
 * intact (the old version destructively emptied them, O(n^2) on ArrayList)
 * and empty lists no longer throw (the old do/while assumed >= 1 element).
 *
 * @param ref       path to the reference fasta
 * @param bams      bam file paths; may be empty (then no -I flags are emitted)
 * @param intervals interval strings; may be empty (then no -L flags are emitted)
 * @return the assembled command-line string
 */
private String buildRootCmd(String ref, List<String> bams, List<String> intervals) {
    StringBuilder bamBuilder = new StringBuilder();
    for ( String bam : bams ) {
        bamBuilder.append(" -I ");
        bamBuilder.append(bam);
    }
    StringBuilder intervalBuilder = new StringBuilder();
    for ( String interval : intervals ) {
        intervalBuilder.append(" -L ");
        intervalBuilder.append(interval);
    }
    return root + "-R " + ref + bamBuilder.toString() + intervalBuilder.toString();
}
private void execute(String name, WalkerTestSpec spec) {
@ -30,19 +47,23 @@ public class CoverageStatisticsIntegrationTest extends WalkerTest {
@Test
// NOTE(review): this span appears to be a unified diff rendered with the +/-
// markers stripped, so the OLD and NEW versions of several statements both
// appear below. As plain Java the duplicated declarations (baseOutputFile,
// cmd, expected) and the duplicated execute(...) call would not compile;
// resolve against the original commit before treating this as source.
public void testBaseOutputNoFiltering() {
// our base file
File baseOutputFile = this.createTempFile("outputtemp",".tmp");
File baseOutputFile = this.createTempFile("depthofcoveragenofiltering",".tmp");
this.setOutputFileLocation(baseOutputFile);
// presumably the three String locals below are the OLD revision's inputs,
// replaced by the array-based inputs that follow — TODO confirm from the diff
String bam_file = "/humgen/gsa-hphome1/chartl/projects/depthOfCoverage/testFiles/bams/Ciliopathies_1_88534_3_samples.bam";
String interval_list = "chr1:855534";
String reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
String cmd = buildRootCmd(reference,bam_file,interval_list) + " -mmq 0 -mbq 0 -omitSampleSummary -omitLocus";
String expected = "d41d8cd98f00b204e9800998ecf8427e";
String[] intervals = {"1:10,000,000-10,000,800","1:10,250,001-10,250,500","1:10,500,001-10,500,300","1:10,750,001-10,750,400"};
String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam","/broad/1KG/DCC_merged/freeze5/NA19240.pilot2.454.bam"};
String cmd = buildRootCmd(b36,new ArrayList<String>(Arrays.asList(bams)),new ArrayList<String>(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts";
String expected = "cb87d6069ac60c73f047efc6d9386619";
WalkerTestSpec spec = new WalkerTestSpec(cmd,1, Arrays.asList(expected));
// now add the expected files that get generated
spec.addAuxFile("344936e0bb4613544ff137cd1a002d6c",new File(baseOutputFile.getAbsolutePath() + ".interval_statistics"));
execute("testBaseOutputNoFiltering",spec);
// the aux-file md5s below register every derived output for verification
spec.addAuxFile("aff2349d6dc221c08f6c469379aeaedf", new File(baseOutputFile.getAbsolutePath() + ".interval_statistics"));
spec.addAuxFile("6476ed0c54a4307a618aa6d3268b050f", new File(baseOutputFile.getAbsolutePath()+".interval_summary"));
spec.addAuxFile("c744a298b7541f3f823e6937e9a0bc67", new File(baseOutputFile.getAbsolutePath()+".locus_statistics"));
spec.addAuxFile("65318c1e73d98a59cc6f817cde12d3d4", new File(baseOutputFile.getAbsolutePath()+".summary_statistics"));
spec.addAuxFile("9fc19f773a7ddfbb473d124e675a3d94", new File(baseOutputFile.getAbsolutePath()+".sample_statistics"));
// RUN_TESTS is a class-level kill switch (set false in this revision)
if ( RUN_TESTS )
execute("testBaseOutputNoFiltering",spec);
}
}