Changes to DepthOfCoverage (JIRA items) and added back an integration test to cover it. Alterations to the design file generator to output all transcripts (rather than choosing one at random).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3366 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-05-17 17:23:00 +00:00
parent 4235164359
commit b7d21627ab
4 changed files with 106 additions and 83 deletions

View File

@@ -77,6 +77,10 @@ public class rodRefSeq extends BasicReferenceOrderedDatum implements Transcript
return numbers;
}
public String getTranscriptUniqueGeneName() {
    // Build a per-transcript label of the form geneName(transcriptId),
    // so distinct transcripts of the same gene remain distinguishable.
    return getGeneName() + "(" + getTranscriptId() + ")";
}
public String getOverlapString(GenomeLoc position) {
boolean is_exon = false;
StringBuilder overlapString = new StringBuilder();

View File

@@ -61,13 +61,6 @@ import java.util.*;
* @Author chartl
* @Date Feb 22, 2010
*/
// todo [DONE] -- add per target (e.g. regional) aggregation
// todo [DONE] -- add ability to print out the calculated bins and quit (for pre-analysis bin size selection)
// todo [DONE] -- refactor the location of the ALL_SAMPLE metrics [keep out of the per-sample HashMaps]
// todo [DONE] -- per locus output through -o
// todo [DONE] -- support for using read groups instead of samples
// todo [DONE] -- coverage including deletions
// todo [DONE] -- base counts
// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time
// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n)
// todo -- alter logarithmic scaling to spread out bins more
@@ -81,9 +74,9 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
int stop = 500;
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false)
int nBins = 499;
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false)
byte minMappingQuality = -1;
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to -1.", required = false)
byte minBaseQuality = -1;
@Argument(fullName = "printBaseCounts", shortName = "baseCounts", doc = "Will add base counts to per-locus output.", required = false)
boolean printBaseCounts = false;
@@ -107,6 +100,11 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
boolean ignoreDeletionSites = false;
@Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. Currently accepts RefSeq.", required = false)
File refSeqGeneList = null;
@Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. csv, table, rtable); defaults to r-readable table", required = false)
String outputFormat = "rtable";
String[] OUTPUT_FORMATS = {"table","rtable","csv"};
String separator = "\t";
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
@@ -126,6 +124,20 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
System.exit(0);
}
boolean goodOutputFormat = false;
for ( String f : OUTPUT_FORMATS ) {
goodOutputFormat = goodOutputFormat || f.equals(outputFormat);
}
if ( ! goodOutputFormat ) {
System.out.println("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat);
System.exit(0);
}
if ( outputFormat.equals("csv") ) {
separator = ",";
}
if ( getToolkit().getArguments().outFileName == null ) {
logger.warn("This walker creates many output files from one input file; you may wish to specify an input file rather "+
"than defaulting all output to stdout.");
@@ -282,30 +294,31 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
throw new StingException("Unable to open interval file on reduce", e);
}
Pair<GenomeLoc,CoverageAggregator> firstPair = statsByInterval.get(0);
CoverageAggregator firstAggregator = firstPair.second;
DepthOfCoverageStats firstStats = isSample ? firstAggregator.getCoverageBySample() : firstAggregator.getCoverageByReadGroup();
StringBuilder summaryHeader = new StringBuilder();
summaryHeader.append("Target");
summaryHeader.append("\ttotal_coverage");
summaryHeader.append("\taverage_coverage");
summaryHeader.append(separator);
summaryHeader.append("total_coverage");
summaryHeader.append(separator);
summaryHeader.append("average_coverage");
for ( String s : firstStats.getAllSamples() ) {
summaryHeader.append("\t");
summaryHeader.append(separator);
summaryHeader.append(s);
summaryHeader.append("_total_cvg");
summaryHeader.append("\t");
summaryHeader.append(separator);
summaryHeader.append(s);
summaryHeader.append("_mean_cvg");
summaryHeader.append("\t");
summaryHeader.append(separator);
summaryHeader.append(s);
summaryHeader.append("_granular_Q1");
summaryHeader.append("\t");
summaryHeader.append(separator);
summaryHeader.append(s);
summaryHeader.append("_granular_median");
summaryHeader.append("\t");
summaryHeader.append(separator);
summaryHeader.append(s);
summaryHeader.append("_granular_Q3");
}
@@ -386,26 +399,27 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
private void printTargetSummary(PrintStream output, Pair<?,DepthOfCoverageStats> intervalStats) {
DepthOfCoverageStats stats = intervalStats.second;
int[] bins = stats.getEndpoints();
StringBuilder targetSummary = new StringBuilder();
targetSummary.append(intervalStats.first.toString());
targetSummary.append("\t");
targetSummary.append(separator);
targetSummary.append(stats.getTotalCoverage());
targetSummary.append("\t");
targetSummary.append(separator);
targetSummary.append(String.format("%.2f",stats.getTotalMeanCoverage()));
for ( String s : stats.getAllSamples() ) {
targetSummary.append("\t");
targetSummary.append(separator);
targetSummary.append(stats.getTotals().get(s));
targetSummary.append("\t");
targetSummary.append(separator);
targetSummary.append(String.format("%.2f", stats.getMeans().get(s)));
targetSummary.append("\t");
targetSummary.append(separator);
int median = getQuantile(stats.getHistograms().get(s),0.5);
int q1 = getQuantile(stats.getHistograms().get(s),0.25);
int q3 = getQuantile(stats.getHistograms().get(s),0.75);
targetSummary.append(formatBin(bins,q1));
targetSummary.append("\t");
targetSummary.append(separator);
targetSummary.append(formatBin(bins,median));
targetSummary.append("\t");
targetSummary.append(separator);
targetSummary.append(formatBin(bins,q3));
}
@@ -424,16 +438,17 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
}
private void printIntervalTable(PrintStream output, int[][] intervalTable, int[] cutoffs) {
output.printf("\tdepth>=%d",0);
String colHeader = outputFormat.equals("rtable") ? "" : "Number_of_sources";
output.printf(colHeader + separator+"depth>=%d",0);
for ( int col = 0; col < intervalTable[0].length-1; col ++ ) {
output.printf("\tdepth>=%d",cutoffs[col]);
output.printf(separator+"depth>=%d",cutoffs[col]);
}
output.printf(String.format("%n"));
for ( int row = 0; row < intervalTable.length; row ++ ) {
output.printf("At_least_%d_samples",row+1);
for ( int col = 0; col < intervalTable[0].length; col++ ) {
output.printf("\t%d",intervalTable[row][col]);
output.printf(separator+"%d",intervalTable[row][col]);
}
output.printf(String.format("%n"));
}
@@ -503,13 +518,13 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
if ( ! useReadGroup || useBoth ) {
File perLocusStatisticsFile = deriveFromStream("sample_locus_statistics");
File perLocusCoverageFile = deriveFromStream("sample_coverage_statistics");
printPerLocus(perLocusStatisticsFile,perLocusCoverageFile,coverageProfiles.getCoverageBySample());
printPerLocus(perLocusStatisticsFile,perLocusCoverageFile,coverageProfiles.getCoverageBySample(),false);
}
if ( useReadGroup || useBoth ) {
File perLocusRGStats = deriveFromStream("read_group_locus_statistics");
File perLocusRGCoverage = deriveFromStream("read_group_locus_coverage");
printPerLocus(perLocusRGStats,perLocusRGCoverage,coverageProfiles.getCoverageByReadGroup());
printPerLocus(perLocusRGStats,perLocusRGCoverage,coverageProfiles.getCoverageByReadGroup(),true);
}
}
}
@@ -530,11 +545,15 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
private void printPerSample(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
PrintStream output = getCorrectStream(out,optionalFile);
int[] leftEnds = stats.getEndpoints();
StringBuilder hBuilder = new StringBuilder();
hBuilder.append("\t");
hBuilder.append(String.format("from_0_to_%d)\t",leftEnds[0]));
if ( ! outputFormat.equals("rTable")) {
hBuilder.append("Source_of_reads");
}
hBuilder.append(separator);
hBuilder.append(String.format("from_0_to_%d)%s",leftEnds[0],separator));
for ( int i = 1; i < leftEnds.length; i++ )
hBuilder.append(String.format("from_%d_to_%d)\t",leftEnds[i-1],leftEnds[i]));
hBuilder.append(String.format("from_%d_to_%d)%s",leftEnds[i-1],leftEnds[i],separator));
hBuilder.append(String.format("from_%d_to_inf%n",leftEnds[leftEnds.length-1]));
output.print(hBuilder.toString());
Map<String,int[]> histograms = stats.getHistograms();
@@ -542,14 +561,14 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
StringBuilder sBuilder = new StringBuilder();
sBuilder.append(String.format("sample_%s",s));
for ( int count : histograms.get(s) ) {
sBuilder.append(String.format("\t%d",count));
sBuilder.append(String.format("%s%d",separator,count));
}
sBuilder.append(String.format("%n"));
output.print(sBuilder.toString());
}
}
private void printPerLocus(File locusFile, File coverageFile, DepthOfCoverageStats stats) {
private void printPerLocus(File locusFile, File coverageFile, DepthOfCoverageStats stats, boolean isRG) {
PrintStream output = getCorrectStream(out,locusFile);
PrintStream coverageOut = getCorrectStream(out,coverageFile);
if ( output == null ) {
@@ -564,10 +583,19 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
// rows - # of samples
// columns - depth of coverage
boolean printSampleColumnHeader = outputFormat.equals("csv") || outputFormat.equals("table");
StringBuilder header = new StringBuilder();
header.append(String.format("\tgte_0"));
if ( printSampleColumnHeader ) {
if ( isRG ) {
header.append("ReadGroup");
} else {
header.append("Sample");
}
}
header.append(String.format("%sgte_0",separator));
for ( int d : endpoints ) {
header.append(String.format("\tgte_%d",d));
header.append(String.format("%sgte_%d",separator,d));
}
header.append(String.format("%n"));
@@ -577,7 +605,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
for ( int row = 0; row < samples; row ++ ) {
output.printf("%s_%d","NSamples",row+1);
for ( int depthBin = 0; depthBin < baseCoverageCumDist[0].length; depthBin ++ ) {
output.printf("\t%d",baseCoverageCumDist[row][depthBin]);
output.printf("%s%d",separator,baseCoverageCumDist[row][depthBin]);
}
output.printf("%n");
}
@@ -586,7 +614,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
coverageOut.printf("%s",sample);
double[] coverageDistribution = stats.getCoverageProportions(sample);
for ( int bin = 0; bin < coverageDistribution.length; bin ++ ) {
coverageOut.printf("\t%.2f",coverageDistribution[bin]);
coverageOut.printf("%s%.2f",separator,coverageDistribution[bin]);
}
coverageOut.printf("%n");
}
@@ -610,8 +638,11 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
private void printSummary(PrintStream out, File optionalFile, DepthOfCoverageStats stats) {
PrintStream output = getCorrectStream(out,optionalFile);
output.printf("%s\t%s\t%s\t%s\t%s\t%s%n","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile");
if ( ! outputFormat.equals("csv") ) {
output.printf("%s\t%s\t%s\t%s\t%s\t%s%n","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile");
} else {
output.printf("%s,%s,%s,%s,%s,%s%n","sample_id","total","mean","granular_third_quartile","granular_median","granular_first_quartile");
}
Map<String,int[]> histograms = stats.getHistograms();
Map<String,Double> means = stats.getMeans();
@@ -627,10 +658,18 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
median = median == histogram.length-1 ? histogram.length-2 : median;
q1 = q1 == histogram.length-1 ? histogram.length-2 : q1;
q3 = q3 == histogram.length-1 ? histogram.length-2 : q3;
output.printf("%s\t%d\t%.2f\t%d\t%d\t%d%n",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]);
if ( ! outputFormat.equals("csv") ) {
output.printf("%s\t%d\t%.2f\t%d\t%d\t%d%n",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]);
} else {
output.printf("%s,%d,%.2f,%d,%d,%d%n",s,totals.get(s),means.get(s),leftEnds[q3],leftEnds[median],leftEnds[q1]);
}
}
output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A");
if ( ! outputFormat.equals("csv") ) {
output.printf("%s\t%d\t%.2f\t%s\t%s\t%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A");
} else {
output.printf("%s,%d,%.2f,%s,%s,%s%n","Total",stats.getTotalCoverage(),stats.getTotalMeanCoverage(),"N/A","N/A","N/A");
}
}
private int getQuantile(int[] histogram, double prop) {
@@ -656,19 +695,18 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
StringBuilder perSampleOutput = new StringBuilder();
int tDepth = 0;
for ( String s : allSamples ) {
perSampleOutput.append("\t");
perSampleOutput.append(separator);
long dp = countsBySample.keySet().contains(s) ? sumArray(countsBySample.get(s)) : 0;
perSampleOutput.append(dp);
if ( printBaseCounts ) {
perSampleOutput.append("\t");
perSampleOutput.append(separator);
perSampleOutput.append(baseCounts(countsBySample.get(s)));
}
tDepth += dp;
}
// remember -- genome locus was printed in map()
stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
//System.out.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
stream.printf("%s%d%s%.2f%s%n",separator,tDepth,separator,( (double) tDepth/ (double) allSamples.size()),
perSampleOutput);
}
private long sumArray(int[] array) {

View File

@@ -86,9 +86,9 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
for ( GenomeLoc interval : intervalBuffer.keySet() ) {
for ( rodRefSeq refseq : refseqBuffer ) {
if ( interval.overlapsP(refseq.getLocation()) &&
! intervalBuffer.get(interval).geneNames.contains(refseq.getGeneName()) ) {
! intervalBuffer.get(interval).geneNames.contains(refseq.getTranscriptUniqueGeneName()) ) {
// if the interval overlaps the gene transcript; and the gene is not already represented in the interval
intervalBuffer.get(interval).update(refseq.getGeneName(),
intervalBuffer.get(interval).update(refseq.getTranscriptUniqueGeneName(),
refseq.getExonsInInterval(interval),
refseq.getExonNumbersInInterval(interval));
nUpdate++;

View File

@@ -50,45 +50,26 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
File baseOutputFile = this.createTempFile("depthofcoveragenofiltering",".tmp");
this.setOutputFileLocation(baseOutputFile);
String[] intervals = {"1:10,000,000-10,000,800","1:10,250,001-10,250,500","1:10,500,001-10,500,300","1:10,750,001-10,750,400"};
String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam","/broad/1KG/DCC_merged/freeze5/NA19240.pilot2.454.bam"};
String[] intervals = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/fhs_jhs_30_targts.interval_list"};
String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/FHS_indexed_subset.bam"};
String cmd = buildRootCmd(b36,new ArrayList<String>(Arrays.asList(bams)),new ArrayList<String>(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -both";
String cmd = buildRootCmd(hg18,new ArrayList<String>(Arrays.asList(bams)),new ArrayList<String>(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -both --outputFormat csv";
WalkerTestSpec spec = new WalkerTestSpec(cmd,0, new ArrayList<String>());
// now add the expected files that get generated
spec.addAuxFile("959937a9b0ace520b4b7d9915d708003", baseOutputFile);
spec.addAuxFile("aff2349d6dc221c08f6c469379aeaedf", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics"));
spec.addAuxFile("6476ed0c54a4307a618aa6d3268b050f", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary"));
spec.addAuxFile("faf63982622764299c734299ffeb3c63", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_locus_statistics"));
spec.addAuxFile("65318c1e73d98a59cc6f817cde12d3d4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary_statistics"));
spec.addAuxFile("ef8c3e2ba3fc0da829e10e2d487c00d2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics"));
spec.addAuxFile("223377e07b35e81a394b75b38d8e72ee", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics"));
spec.addAuxFile("096f4ed94020327288ea76245ebd6942", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary"));
spec.addAuxFile("8e391b7b84fc19dff83002b6dd9773d8", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_locus_statistics"));
spec.addAuxFile("43c160ff9d754744728c142709011993", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics"));
spec.addAuxFile("a374410efe20609c5c4b87a6da7f4d51", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary"));
spec.addAuxFile("f53ddf25c2b71e46381f9c434402d88d", baseOutputFile);
spec.addAuxFile("925cc5b49286e0222bce6251d1baafc7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics"));
spec.addAuxFile("d9e812398d778f28ed12d7f3d18628e2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary"));
spec.addAuxFile("80577bf378de570f84d91b0ef6004102", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_locus_statistics"));
spec.addAuxFile("3a059ad82d945643dc4e03239c4041f5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary_statistics"));
spec.addAuxFile("f3315551081331bc322c53b11412d707", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics"));
spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics"));
spec.addAuxFile("fd29fe0c14351e934a6fef9df1f38f7b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary"));
spec.addAuxFile("111261f0e8ccf8c456d0b2a9482bc32c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_locus_statistics"));
spec.addAuxFile("cc7ee5075a932dba486e78824ca34202", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics"));
spec.addAuxFile("e1653480daa2d96f7c584ed4cd20c147", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary"));
//execute("testBaseOutputNoFiltering",spec);
}
@Test
public void testMedianOverRightHandBin() {
File base = this.createTempFile("depthofcoveragelowbins",".tmp");
this.setOutputFileLocation(base);
String[] intervals = {"1:10,000,000-10,000,800","1:10,250,001-10,250,500","1:10,500,001-10,500,300","1:10,750,001-10,750,400"};
String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam","/broad/1KG/DCC_merged/freeze5/NA19240.pilot2.454.bam"};
String cmd = buildRootCmd(b36,new ArrayList<String>(Arrays.asList(bams)),new ArrayList<String>(Arrays.asList(intervals))) +
" -mmq 0 -mbq 0 -dels -baseCounts -both --start 1 --stop 14 --nBins 13";
WalkerTestSpec spec = new WalkerTestSpec(cmd,0, new ArrayList<String>());
spec.addAuxFile("959937a9b0ace520b4b7d9915d708003", base);
spec.addAuxFile("219d643627eedd696bc476aac96376c2", createTempFileFromBase(base.getAbsolutePath()+".read_group_interval_statistics"));
spec.addAuxFile("dd0225cf1e0b0bd4289b82fd4939f9fd", createTempFileFromBase(base.getAbsolutePath()+".sample_interval_statistics"));
spec.addAuxFile("63575a8a2110507e08d421d44d06b327", createTempFileFromBase(base.getAbsolutePath()+".sample_interval_summary"));
//execute("testMedianOverRHBin",spec);
execute("testBaseOutputNoFiltering",spec);
}
public File createTempFileFromBase(String name) {