Commit for Aaron

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2932 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-03-04 21:29:07 +00:00
parent c20d3e567e
commit 706d49d84c
2 changed files with 107 additions and 19 deletions

View File

@ -4,6 +4,11 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.LocusWalker;
@ -48,9 +53,9 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
int start = 1; int start = 1;
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
int stop = 1000; int stop = 500;
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false) @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
int nBins = 20; int nBins = 499;
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false) @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
byte minMappingQuality = 50; byte minMappingQuality = 50;
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false) @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
@ -73,6 +78,8 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
boolean includeDeletions = false; boolean includeDeletions = false;
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
boolean ignoreDeletionSites = false; boolean ignoreDeletionSites = false;
@Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
File refSeqGeneList = null;
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS // STANDARD WALKER METHODS
@ -98,17 +105,21 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
if ( ! omitDepthOutput ) { // print header if ( ! omitDepthOutput ) { // print header
out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth"); out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth");
//System.out.printf("\t[log]\t%s\t%s\t%s","Locus","Total_Depth","Average_Depth");
// get all the samples // get all the samples
HashSet<String> allSamples = getSamplesFromToolKit(useReadGroup); HashSet<String> allSamples = getSamplesFromToolKit(useReadGroup);
for ( String s : allSamples) { for ( String s : allSamples) {
out.printf("\t%s_%s","Depth_for",s); out.printf("\t%s_%s","Depth_for",s);
//System.out.printf("\t%s_%s","Depth_for",s);
if ( printBaseCounts ) { if ( printBaseCounts ) {
out.printf("\t%s_%s",s,"_base_counts"); out.printf("\t%s_%s",s,"base_counts");
//System.out.printf("\t%s_%s",s,"base_counts");
} }
} }
out.printf("%n"); out.printf("%n");
//System.out.printf("%n");
} else { } else {
out.printf("Per-Locus Depth of Coverage output was omitted"); out.printf("Per-Locus Depth of Coverage output was omitted");
@ -162,6 +173,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
if ( ! omitDepthOutput ) { if ( ! omitDepthOutput ) {
out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
//System.out.printf("\t[log]\t%s",ref.getLocus());
} }
Map<String,int[]> countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality, Map<String,int[]> countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality,
@ -191,6 +203,9 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
public void onTraversalDone( List<Pair<GenomeLoc,DepthOfCoverageStats>> statsByInterval ) { public void onTraversalDone( List<Pair<GenomeLoc,DepthOfCoverageStats>> statsByInterval ) {
if ( refSeqGeneList != null ) {
printGeneStats(statsByInterval);
}
File intervalStatisticsFile = deriveFromStream("interval_statistics"); File intervalStatisticsFile = deriveFromStream("interval_statistics");
File intervalSummaryFile = deriveFromStream("interval_summary"); File intervalSummaryFile = deriveFromStream("interval_summary");
DepthOfCoverageStats mergedStats = printIntervalStatsAndMerge(statsByInterval,intervalSummaryFile, intervalStatisticsFile); DepthOfCoverageStats mergedStats = printIntervalStatsAndMerge(statsByInterval,intervalSummaryFile, intervalStatisticsFile);
@ -263,7 +278,57 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
return firstStats; return firstStats;
} }
private void printTargetSummary(PrintStream output, Pair<GenomeLoc,DepthOfCoverageStats> intervalStats) { private void printGeneStats(List<Pair<GenomeLoc,DepthOfCoverageStats>> statsByTarget) {
LocationAwareSeekableRODIterator refseqIterator = initializeRefSeq();
List<Pair<String,DepthOfCoverageStats>> statsByGene = new ArrayList<Pair<String,DepthOfCoverageStats>>();// maintains order
Map<String,DepthOfCoverageStats> geneNamesToStats = new HashMap<String,DepthOfCoverageStats>(); // alows indirect updating of objects in list
for ( Pair<GenomeLoc,DepthOfCoverageStats> targetStats : statsByTarget ) {
String gene = getGeneName(targetStats.first,refseqIterator);
if ( geneNamesToStats.keySet().contains(gene) ) {
geneNamesToStats.get(gene).merge(targetStats.second);
} else {
geneNamesToStats.put(gene,targetStats.second);
statsByGene.add(new Pair<String,DepthOfCoverageStats>(gene,targetStats.second));
}
}
PrintStream geneSummaryOut = getCorrectStream(out,deriveFromStream("gene_summary"));
for ( Pair<String,DepthOfCoverageStats> geneStats : statsByGene ) {
printTargetSummary(geneSummaryOut,geneStats);
}
if ( ! getToolkit().getArguments().outFileName.contains("stdout")) {
geneSummaryOut.close();
}
}
//blatantly stolen from Andrew Kernytsky
private String getGeneName(GenomeLoc target, LocationAwareSeekableRODIterator refseqIterator) {
if (refseqIterator == null) { return "UNKNOWN"; }
RODRecordList annotationList = refseqIterator.seekForward(target);
if (annotationList == null) { return "UNKNOWN"; }
for(ReferenceOrderedDatum rec : annotationList) {
if ( ((rodRefSeq)rec).overlapsExonP(target) ) {
return ((rodRefSeq)rec).getGeneName();
}
}
return "UNKNOWN";
}
private LocationAwareSeekableRODIterator initializeRefSeq() {
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
refSeqGeneList, rodRefSeq.class);
return refseq.iterator();
}
private void printTargetSummary(PrintStream output, Pair<?,DepthOfCoverageStats> intervalStats) {
DepthOfCoverageStats stats = intervalStats.second; DepthOfCoverageStats stats = intervalStats.second;
int[] bins = stats.getEndpoints(); int[] bins = stats.getEndpoints();
StringBuilder targetSummary = new StringBuilder(); StringBuilder targetSummary = new StringBuilder();
@ -452,10 +517,11 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
private void printSummary(PrintStream out, File optionalFile, DepthOfCoverageStats stats) { private void printSummary(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
PrintStream output = getCorrectStream(out,optionalFile); PrintStream output = getCorrectStream(out,optionalFile);
output.printf("%s\t%s\t%s\t%s\t%s%n","sample_id","mean","granular_third_quartile","granular_median","granular_first_quartile"); output.printf("%s\t%s\t%s\t%s\t%s\t%s%n","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile");
Map<String,int[]> histograms = stats.getHistograms(); Map<String,int[]> histograms = stats.getHistograms();
Map<String,Double> means = stats.getMeans(); Map<String,Double> means = stats.getMeans();
Map<String,Long> totals = stats.getTotals();
int[] leftEnds = stats.getEndpoints(); int[] leftEnds = stats.getEndpoints();
for ( String s : histograms.keySet() ) { for ( String s : histograms.keySet() ) {
@ -467,10 +533,10 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
median = median == histogram.length-1 ? histogram.length-2 : median; median = median == histogram.length-1 ? histogram.length-2 : median;
q1 = q1 == histogram.length-1 ? histogram.length-2 : q1; q1 = q1 == histogram.length-1 ? histogram.length-2 : q1;
q3 = q3 == histogram.length-1 ? histogram.length-2 : q3; q3 = q3 == histogram.length-1 ? histogram.length-2 : q3;
output.printf("%s\t%.2f\t%d\t%d\t%d%n",s,means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]); output.printf("%s\t%d\t%.2f\t%d\t%d\t%d%n",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]);
} }
output.printf("%s\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalMeanCoverage(),"N/A","N/A","N/A"); output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A");
} }
private int getQuantile(int[] histogram, double prop) { private int getQuantile(int[] histogram, double prop) {
@ -507,6 +573,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, DepthOfCo
} }
// remember -- genome locus was printed in map() // remember -- genome locus was printed in map()
stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
//System.out.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
} }

View File

@ -4,7 +4,9 @@ import org.broadinstitute.sting.WalkerTest;
import org.junit.Test; import org.junit.Test;
import java.io.File; import java.io.File;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List;
/** /**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
@ -14,11 +16,26 @@ import java.util.Arrays;
*/ */
public class CoverageStatisticsIntegrationTest extends WalkerTest { public class CoverageStatisticsIntegrationTest extends WalkerTest {
private boolean RUN_TESTS = true; private boolean RUN_TESTS = false;
private String root = "-T CoverageStatistics "; private String root = "-T CoverageStatistics ";
private String hg18 = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
private String b36 = "/broad/1KG/reference/human_b36_both.fasta";
private String buildRootCmd(String ref, String bam, String interval) { private String buildRootCmd(String ref, List<String> bams, List<String> intervals) {
return root + "-R "+ref+" -I "+bam+" -L "+interval; StringBuilder bamBuilder = new StringBuilder();
do {
bamBuilder.append(" -I ");
bamBuilder.append(bams.remove(0));
} while ( bams.size() > 0 );
StringBuilder intervalBuilder = new StringBuilder();
do {
intervalBuilder.append(" -L ");
intervalBuilder.append(intervals.remove(0));
} while ( intervals.size() > 0 );
return root + "-R "+ref+bamBuilder.toString()+intervalBuilder.toString();
} }
private void execute(String name, WalkerTestSpec spec) { private void execute(String name, WalkerTestSpec spec) {
@ -30,19 +47,23 @@ public class CoverageStatisticsIntegrationTest extends WalkerTest {
@Test @Test
public void testBaseOutputNoFiltering() { public void testBaseOutputNoFiltering() {
// our base file // our base file
File baseOutputFile = this.createTempFile("outputtemp",".tmp"); File baseOutputFile = this.createTempFile("depthofcoveragenofiltering",".tmp");
this.setOutputFileLocation(baseOutputFile); this.setOutputFileLocation(baseOutputFile);
String bam_file = "/humgen/gsa-hphome1/chartl/projects/depthOfCoverage/testFiles/bams/Ciliopathies_1_88534_3_samples.bam"; String[] intervals = {"1:10,000,000-10,000,800","1:10,250,001-10,250,500","1:10,500,001-10,500,300","1:10,750,001-10,750,400"};
String interval_list = "chr1:855534"; String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam","/broad/1KG/DCC_merged/freeze5/NA19240.pilot2.454.bam"};
String reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
String cmd = buildRootCmd(reference,bam_file,interval_list) + " -mmq 0 -mbq 0 -omitSampleSummary -omitLocus"; String cmd = buildRootCmd(b36,new ArrayList<String>(Arrays.asList(bams)),new ArrayList<String>(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts";
String expected = "d41d8cd98f00b204e9800998ecf8427e"; String expected = "cb87d6069ac60c73f047efc6d9386619";
WalkerTestSpec spec = new WalkerTestSpec(cmd,1, Arrays.asList(expected)); WalkerTestSpec spec = new WalkerTestSpec(cmd,1, Arrays.asList(expected));
// now add the expected files that get generated // now add the expected files that get generated
spec.addAuxFile("344936e0bb4613544ff137cd1a002d6c",new File(baseOutputFile.getAbsolutePath() + ".interval_statistics")); spec.addAuxFile("aff2349d6dc221c08f6c469379aeaedf", new File(baseOutputFile.getAbsolutePath() + ".interval_statistics"));
spec.addAuxFile("6476ed0c54a4307a618aa6d3268b050f", new File(baseOutputFile.getAbsolutePath()+".interval_summary"));
spec.addAuxFile("c744a298b7541f3f823e6937e9a0bc67", new File(baseOutputFile.getAbsolutePath()+".locus_statistics"));
spec.addAuxFile("65318c1e73d98a59cc6f817cde12d3d4", new File(baseOutputFile.getAbsolutePath()+".summary_statistics"));
spec.addAuxFile("9fc19f773a7ddfbb473d124e675a3d94", new File(baseOutputFile.getAbsolutePath()+".sample_statistics"));
if ( RUN_TESTS )
execute("testBaseOutputNoFiltering",spec); execute("testBaseOutputNoFiltering",spec);
} }
} }