Moved CoverageStatistics to core. This will be (soon) renamed DepthOfCoverage; so please use CoverageStatistics

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3090 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-03-29 13:32:00 +00:00
parent 1e8b3ca6ba
commit dc802aa26f
8 changed files with 33 additions and 29 deletions

View File

@ -43,11 +43,11 @@ PlotLocusQuantiles <- function(X) {
medians = matrix(nrow=1,ncol=ncol(Z)) medians = matrix(nrow=1,ncol=ncol(Z))
quan90 = matrix(nrow=1,ncol=ncol(Z)) quan90 = matrix(nrow=1,ncol=ncol(Z))
for ( cc in 1:ncol(Z) ) { for ( cc in 1:ncol(Z) ) {
medians[cc] = median(Z[,cc]) medians[cc] = quantile(Z[,cc],0.75)
quan90[cc] = quantile(Z[,cc],0.9) quan90[cc] = quantile(Z[,cc],1)
} }
plot(t(medians),xlab="",xaxt="n",ylab="Proportion of loci with >X coverage",type="b",col="blue") plot(t(medians),xlab="",xaxt="n",ylab="Proportion of loci with >X coverage",type="b",col="blue",yaxp=c(0,1,10))
axis(1,labels=FALSE) axis(1,labels=FALSE)
parseColNames <- function(K) { parseColNames <- function(K) {
M = matrix(nrow=1,ncol=length(K)) M = matrix(nrow=1,ncol=length(K))
@ -63,7 +63,7 @@ PlotLocusQuantiles <- function(X) {
labels <- parseColNames(colnames(X)) labels <- parseColNames(colnames(X))
text(1:length(labels),par("usr")[3]-0.025,srt=90,adj=1,labels=labels,xpd=TRUE,cex=(0.8/32)*length(labels),lheight=(0.8/32)*length(labels)) text(1:length(labels),par("usr")[3]-0.025,srt=90,adj=1,labels=labels,xpd=TRUE,cex=(0.8/32)*length(labels),lheight=(0.8/32)*length(labels))
points(t(quan90),type="b",col="red") points(t(quan90),type="b",col="red")
legend(x=floor(0.6*length(labels)),y=1,c("50% of samples","90% of samples"),col=c("red","blue"),lty=c(1,1)) legend(x=floor(0.6*length(labels)),y=1,c("75% of samples","100% of samples"),col=c("red","blue"),lty=c(1,1))
dev.off() dev.off()
} }

View File

@ -1,9 +1,8 @@
package org.broadinstitute.sting.oneoffprojects.walkers.coverage; package org.broadinstitute.sting.gatk.walkers.coverage;
import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
@ -14,13 +13,13 @@ import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverageWalker; import org.broadinstitute.sting.oneoffprojects.walkers.coverage.CoverageUtils;
import org.broadinstitute.sting.oneoffprojects.walkers.coverage.DepthOfCoverageStats;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -127,7 +126,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
out.printf("Per-Locus Depth of Coverage output was omitted"); out.printf("Per-Locus Depth of Coverage output was omitted");
} }
} }
private HashSet<String> getSamplesFromToolKit( boolean getReadGroupsInstead ) { private HashSet<String> getSamplesFromToolKit( boolean getReadGroupsInstead ) {
HashSet<String> partitions = new HashSet<String>(); // since the DOCS object uses a HashMap, this will be in the same order HashSet<String> partitions = new HashSet<String>(); // since the DOCS object uses a HashMap, this will be in the same order
@ -151,8 +150,13 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
} }
public CoverageAggregator reduceInit() { public CoverageAggregator reduceInit() {
CoverageAggregator.AggregationType agType = useBoth ? CoverageAggregator.AggregationType.BOTH :
( useReadGroup ? CoverageAggregator.AggregationType.READ : CoverageAggregator.AggregationType.SAMPLE ) ; CoverageAggregator.AggregationType agType;
if ( useBoth ) {
agType = CoverageAggregator.AggregationType.BOTH;
} else {
agType = useReadGroup ? CoverageAggregator.AggregationType.READ : CoverageAggregator.AggregationType.SAMPLE;
}
CoverageAggregator aggro = new CoverageAggregator(agType,start,stop,nBins); CoverageAggregator aggro = new CoverageAggregator(agType,start,stop,nBins);
@ -170,7 +174,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
} }
public Map<String,int[]> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Map<String,int[]> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( ! omitDepthOutput ) { if ( ! omitDepthOutput ) {
out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
//System.out.printf("\t[log]\t%s",ref.getLocus()); //System.out.printf("\t[log]\t%s",ref.getLocus());
@ -327,7 +331,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
if ( ! getToolkit().getArguments().outFileName.contains("stdout")) { if ( ! getToolkit().getArguments().outFileName.contains("stdout")) {
geneSummaryOut.close(); geneSummaryOut.close();
} }
} }
//blatantly stolen from Andrew Kernytsky //blatantly stolen from Andrew Kernytsky
@ -450,7 +454,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
/////////////////// ///////////////////
// OPTIONAL OUTPUTS // OPTIONAL OUTPUTS
////////////////// //////////////////
if ( ! omitSampleSummary ) { if ( ! omitSampleSummary ) {
logger.info("Printing summary info"); logger.info("Printing summary info");
if ( ! useReadGroup || useBoth ) { if ( ! useReadGroup || useBoth ) {
@ -561,7 +565,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
output = out; output = out;
} }
} }
return output; return output;
} }
@ -606,7 +610,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
return bin; return bin;
} }
private void printDepths(PrintStream stream, Map<String,int[]> countsBySample, Set<String> allSamples) { private void printDepths(PrintStream stream, Map<String,int[]> countsBySample, Set<String> allSamples) {
// get the depths per sample and build up the output string while tabulating total and average coverage // get the depths per sample and build up the output string while tabulating total and average coverage
// todo -- update me to deal with base counts/indels // todo -- update me to deal with base counts/indels
@ -625,7 +629,7 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
// remember -- genome locus was printed in map() // remember -- genome locus was printed in map()
stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); stream.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
//System.out.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput); //System.out.printf("\t%d\t%.2f\t%s%n",tDepth,( (double) tDepth/ (double) allSamples.size()), perSampleOutput);
} }
private long sumArray(int[] array) { private long sumArray(int[] array) {
@ -658,7 +662,6 @@ public class CoverageStatistics extends LocusWalker<Map<String,int[]>, CoverageA
} }
} }
class CoverageAggregator { class CoverageAggregator {
private DepthOfCoverageStats coverageByRead; private DepthOfCoverageStats coverageByRead;
private DepthOfCoverageStats coverageBySample; private DepthOfCoverageStats coverageBySample;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.oneoffprojects.walkers.coverage; package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.oneoffprojects.walkers.coverage; package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.oneoffprojects.walkers.coverage; package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.WalkerTest;
import org.junit.Test; import org.junit.Test;

View File

@ -10,7 +10,7 @@ hg18_dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_130_hg18.rod"
b36_dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_130_b36.rod" b36_dbsnp = "/humgen/gsa-hpprojects/GATK/data/dbsnp_130_b36.rod"
b36_reference = "/broad/1KG/reference/human_b36_both.fasta" b36_reference = "/broad/1KG/reference/human_b36_both.fasta"
hg18_intervals = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list" hg18_intervals = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list"
#hg18_intervals = "/humgen/gsa-hpprojects/FHS/indexed/interval_lists/fhs_jhs_pilot.targets.interval_list" hg18_intervals = "/humgen/gsa-hpprojects/FHS/indexed/interval_lists/fhs_jhs_pilot.targets.interval_list"
b36_intervals = "" b36_intervals = ""
min_base_q = "10" min_base_q = "10"
@ -37,8 +37,8 @@ else:
fpref = "human_b36" fpref = "human_b36"
outputFile = projectName+"_bam_files.txt" outputFile = projectName+"_bam_files.txt"
OUTPUT_HEADER = ["sample_id","recalibrated_bam_file","individual_id","fingerprint_file","reference_file","dbsnp_file","interval_list","max_reads_at_locus","min_confidence","min_mapping_quality","min_base_quality","variant_filter_expression","variant_filter_name"] OUTPUT_HEADER = ["sample_id","recalibrated_bam_file","individual_id","fingerprint_file","reference_file","interval_list","max_reads_at_locus","min_confidence","min_mapping_quality","min_base_quality","variant_filter_expression","variant_filter_name"]
OUTPUT_HEADER_INDIVIDUAL = ["reference_file","dbsnp_file","interval_list","max_reads_at_locus","min_confidence","min_mapping_quality","min_base_quality","variant_filter_expression","variant_filter_name"] OUTPUT_HEADER_INDIVIDUAL = ["reference_file","interval_list","max_reads_at_locus","min_confidence","min_mapping_quality","min_base_quality","variant_filter_expression","variant_filter_name"]
if ( spreadsheetPath.find("/") > -1 ): if ( spreadsheetPath.find("/") > -1 ):
newSpreadsheet = spreadsheetPath.rsplit("/",1)[1].rsplit(".",1)[0]+"_proper_format.tsv" newSpreadsheet = spreadsheetPath.rsplit("/",1)[1].rsplit(".",1)[0]+"_proper_format.tsv"
@ -81,7 +81,7 @@ for line in project_info.readlines():
else: else:
fingerprint_file = "" fingerprint_file = ""
if ( spline[status_index] == "Complete" ): if ( spline[status_index] == "Complete" ):
outputFile.write(projectName+"_"+spline[sample_index]+"\t"+bamfile+"\t"+groupName+"\t"+fingerprint_file+"\t"+reference+"\t"+dbsnp+"\t"+intervals+"\t"+max_reads+"\t"+min_conf+"\t"+min_map_q+"\t"+min_base_q+"\t"+variant_expression+"\t"+filter_name+"\n") outputFile.write(projectName+"_"+spline[sample_index]+"\t"+bamfile+"\t"+groupName+"\t"+fingerprint_file+"\t"+reference+"\t"+intervals+"\t"+max_reads+"\t"+min_conf+"\t"+min_map_q+"\t"+min_base_q+"\t"+variant_expression+"\t"+filter_name+"\n")
outputFile.close() outputFile.close()
outputFile = open(projectName+"_Project_Entry.txt",'w') outputFile = open(projectName+"_Project_Entry.txt",'w')
@ -90,4 +90,4 @@ outputFile.write(projectName)
outputFile.close() outputFile.close()
outputFile = open(projectName+"_Population_Entry.txt",'w') outputFile = open(projectName+"_Population_Entry.txt",'w')
outputFile.write("individual_id\tindividual_set_id\t"+"\t".join(OUTPUT_HEADER_INDIVIDUAL)+"\n") outputFile.write("individual_id\tindividual_set_id\t"+"\t".join(OUTPUT_HEADER_INDIVIDUAL)+"\n")
outputFile.write(groupName+"\t"+projectName+"\t"+reference+"\t"+dbsnp+"\t"+intervals+"\t"+max_reads+"\t"+min_conf+"\t"+min_base_q+"\t"+variant_expression+"\t"+filter_name+"\n") outputFile.write(groupName+"\t"+projectName+"\t"+reference+"\t"+intervals+"\t"+max_reads+"\t"+min_conf+"\t"+min_base_q+"\t"+variant_expression+"\t"+filter_name+"\n")

View File

@ -4,10 +4,11 @@ import sys
import os import os
bam_file = sys.argv[1] bam_file = sys.argv[1]
project = sys.argv[2] fingerprint_file = sys.argv[2]
project = sys.argv[3]
directory = bam_file.rsplit("/",1)[0]+"/" directory = bam_file.rsplit("/",1)[0]+"/"
sample_id = bam_file.rsplit("/",1)[1].rsplit(".",1)[0] sample_id = bam_file.rsplit("/",1)[1].rsplit(".",1)[0]
is_metrics = directory+sample_id+".insert_size_metrics" is_metrics = directory+sample_id+".insert_size_metrics"
his_metrics = directory+sample_id+".hybrid_selection_metrics" his_metrics = directory+sample_id+".hybrid_selection_metrics"
ali_metrics = directory+sample_id+".alignment_summary_metrics" ali_metrics = directory+sample_id+".alignment_summary_metrics"
os.system("zip -j "+project+"_"+sample_id+"_sequencing_metrics"+" "+is_metrics+" "+his_metrics+" "+ali_metrics) os.system("zip -j "+project+"_"+sample_id+"_sequencing_metrics"+" "+is_metrics+" "+his_metrics+" "+ali_metrics+" "+fingerprint_file)

View File

@ -1,4 +1,4 @@
#!/usr/bin/env Python #!/usr/bin/env python
import sys import sys