From 706d49d84c46a4842e841ef0b3c90459f052af3f Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 4 Mar 2010 21:29:07 +0000 Subject: [PATCH] Commit for Aaron git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2932 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/coverage/CoverageStatistics.java | 81 +++++++++++++++++-- .../CoverageStatisticsIntegrationTest.java | 45 ++++++++--- 2 files changed, 107 insertions(+), 19 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java index 403e7e6b8..62d0ce879 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatistics.java @@ -4,6 +4,11 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.rodRefSeq; +import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; @@ -48,9 +53,9 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) int start = 1; @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) - int stop = 1000; + int stop = 500; @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false) - int nBins = 20; + int nBins = 499; @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false) byte minMappingQuality = 50; @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false) @@ -73,6 +78,8 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo boolean includeDeletions = false; @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) boolean ignoreDeletionSites = false; + @Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false) + File refSeqGeneList = null; //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS @@ -98,17 +105,21 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo if ( ! omitDepthOutput ) { // print header out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth"); + //System.out.printf("\t[log]\t%s\t%s\t%s","Locus","Total_Depth","Average_Depth"); // get all the samples HashSet allSamples = getSamplesFromToolKit(useReadGroup); for ( String s : allSamples) { out.printf("\t%s_%s","Depth_for",s); + //System.out.printf("\t%s_%s","Depth_for",s); if ( printBaseCounts ) { - out.printf("\t%s_%s",s,"_base_counts"); + out.printf("\t%s_%s",s,"base_counts"); + //System.out.printf("\t%s_%s",s,"base_counts"); } } out.printf("%n"); + //System.out.printf("%n"); } else { out.printf("Per-Locus Depth of Coverage output was omitted"); @@ -162,6 +173,7 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo if ( ! omitDepthOutput ) { out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) + //System.out.printf("\t[log]\t%s",ref.getLocus()); } Map countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality, @@ -191,6 +203,9 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo //////////////////////////////////////////////////////////////////////////////////// public void onTraversalDone( List> statsByInterval ) { + if ( refSeqGeneList != null ) { + printGeneStats(statsByInterval); + } File intervalStatisticsFile = deriveFromStream("interval_statistics"); File intervalSummaryFile = deriveFromStream("interval_summary"); DepthOfCoverageStats mergedStats = printIntervalStatsAndMerge(statsByInterval,intervalSummaryFile, intervalStatisticsFile); @@ -263,7 +278,57 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo return firstStats; } - private void printTargetSummary(PrintStream output, Pair intervalStats) { + private void printGeneStats(List> statsByTarget) { + LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq(); + List> statsByGene = new ArrayList>();// maintains order + Map geneNamesToStats = new HashMap(); // alows indirect updating of objects in list + + for ( Pair targetStats : statsByTarget ) { + String gene = getGeneName(targetStats.first,refseqIterator); + if ( geneNamesToStats.keySet().contains(gene) ) { + geneNamesToStats.get(gene).merge(targetStats.second); + } else { + geneNamesToStats.put(gene,targetStats.second); + statsByGene.add(new Pair(gene,targetStats.second)); + } + } + + PrintStream geneSummaryOut = getCorrectStream(out,deriveFromStream("gene_summary")); + + for ( Pair geneStats : statsByGene ) { + printTargetSummary(geneSummaryOut,geneStats); + } + + if ( ! getToolkit().getArguments().outFileName.contains("stdout")) { + geneSummaryOut.close(); + } + + } + + //blatantly stolen from Andrew Kernytsky + private String getGeneName(GenomeLoc target, LocationAwareSeekableRODIterator refseqIterator) { + if (refseqIterator == null) { return "UNKNOWN"; } + + RODRecordList annotationList = refseqIterator.seekForward(target); + if (annotationList == null) { return "UNKNOWN"; } + + for(ReferenceOrderedDatum rec : annotationList) { + if ( ((rodRefSeq)rec).overlapsExonP(target) ) { + return ((rodRefSeq)rec).getGeneName(); + } + } + + return "UNKNOWN"; + + } + + private LocationAwareSeekableRODIterator initializeRefSeq() { + ReferenceOrderedData refseq = new ReferenceOrderedData("refseq", + refSeqGeneList, rodRefSeq.class); + return refseq.iterator(); + } + + private void printTargetSummary(PrintStream output, Pair intervalStats) { DepthOfCoverageStats stats = intervalStats.second; int[] bins = stats.getEndpoints(); StringBuilder targetSummary = new StringBuilder(); @@ -452,10 +517,11 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo private void printSummary(PrintStream out, File optionalFile, DepthOfCoverageStats stats) { PrintStream output = getCorrectStream(out,optionalFile); - output.printf("%s\t%s\t%s\t%s\t%s%n","sample_id","mean","granular_third_quartile","granular_median","granular_first_quartile"); + output.printf("%s\t%s\t%s\t%s\t%s\t%s%n","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile"); Map histograms = stats.getHistograms(); Map means = stats.getMeans(); + Map totals = stats.getTotals(); int[] leftEnds = stats.getEndpoints(); for ( String s : histograms.keySet() ) { @@ -467,10 +533,10 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo median = median == histogram.length-1 ? histogram.length-2 : median; q1 = q1 == histogram.length-1 ? histogram.length-2 : q1; q3 = q3 == histogram.length-1 ? histogram.length-2 : q3; - output.printf("%s\t%.2f\t%d\t%d\t%d%n",s,means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); + output.printf("%s\t%d\t%.2f\t%d\t%d\t%d%n",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); } - output.printf("%s\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); + output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); } private int getQuantile(int[] histogram, double prop) { @@ -507,6 +573,7 @@ public class CoverageStatistics extends LocusWalker, DepthOfCo } // remember -- genome locus was printed in map() stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); + //System.out.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); } diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatisticsIntegrationTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatisticsIntegrationTest.java index 7c0390769..35328b50c 100644 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatisticsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/coverage/CoverageStatisticsIntegrationTest.java @@ -4,7 +4,9 @@ import org.broadinstitute.sting.WalkerTest; import org.junit.Test; import java.io.File; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; /** * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl @@ -14,11 +16,26 @@ import java.util.Arrays; */ public class CoverageStatisticsIntegrationTest extends WalkerTest { - private boolean RUN_TESTS = true; + private boolean RUN_TESTS = false; private String root = "-T CoverageStatistics "; + private String hg18 = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; + private String b36 = "/broad/1KG/reference/human_b36_both.fasta"; - private String buildRootCmd(String ref, String bam, String interval) { - return root + "-R "+ref+" -I "+bam+" -L "+interval; + private String buildRootCmd(String ref, List bams, List intervals) { + StringBuilder bamBuilder = new StringBuilder(); + do { + bamBuilder.append(" -I "); + bamBuilder.append(bams.remove(0)); + } while ( bams.size() > 0 ); + + StringBuilder intervalBuilder = new StringBuilder(); + do { + intervalBuilder.append(" -L "); + intervalBuilder.append(intervals.remove(0)); + } while ( intervals.size() > 0 ); + + + return root + "-R "+ref+bamBuilder.toString()+intervalBuilder.toString(); } private void execute(String name, WalkerTestSpec spec) { @@ -30,19 +47,23 @@ public class CoverageStatisticsIntegrationTest extends WalkerTest { @Test public void testBaseOutputNoFiltering() { // our base file - File baseOutputFile = this.createTempFile("outputtemp",".tmp"); + File baseOutputFile = this.createTempFile("depthofcoveragenofiltering",".tmp"); this.setOutputFileLocation(baseOutputFile); - String bam_file = "/humgen/gsa-hphome1/chartl/projects/depthOfCoverage/testFiles/bams/Ciliopathies_1_88534_3_samples.bam"; - String interval_list = "chr1:855534"; - String reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; - String cmd = buildRootCmd(reference,bam_file,interval_list) + " -mmq 0 -mbq 0 -omitSampleSummary -omitLocus"; - String expected = "d41d8cd98f00b204e9800998ecf8427e"; + String[] intervals = {"1:10,000,000-10,000,800","1:10,250,001-10,250,500","1:10,500,001-10,500,300","1:10,750,001-10,750,400"}; + String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam","/broad/1KG/DCC_merged/freeze5/NA19240.pilot2.454.bam"}; + + String cmd = buildRootCmd(b36,new ArrayList(Arrays.asList(bams)),new ArrayList(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts"; + String expected = "cb87d6069ac60c73f047efc6d9386619"; WalkerTestSpec spec = new WalkerTestSpec(cmd,1, Arrays.asList(expected)); // now add the expected files that get generated - spec.addAuxFile("344936e0bb4613544ff137cd1a002d6c",new File(baseOutputFile.getAbsolutePath() + ".interval_statistics")); - - execute("testBaseOutputNoFiltering",spec); + spec.addAuxFile("aff2349d6dc221c08f6c469379aeaedf", new File(baseOutputFile.getAbsolutePath() + ".interval_statistics")); + spec.addAuxFile("6476ed0c54a4307a618aa6d3268b050f", new File(baseOutputFile.getAbsolutePath()+".interval_summary")); + spec.addAuxFile("c744a298b7541f3f823e6937e9a0bc67", new File(baseOutputFile.getAbsolutePath()+".locus_statistics")); + spec.addAuxFile("65318c1e73d98a59cc6f817cde12d3d4", new File(baseOutputFile.getAbsolutePath()+".summary_statistics")); + spec.addAuxFile("9fc19f773a7ddfbb473d124e675a3d94", new File(baseOutputFile.getAbsolutePath()+".sample_statistics")); + if ( RUN_TESTS ) + execute("testBaseOutputNoFiltering",spec); } }