Major refactoring of Depth of Coverage to allow for more extensible partitions of data (now can do read group, sample, and library; in any combination; adding more is fairly easy). Changed the by-gene code to use clones of stats objects, rather than munging the interval DoCs. (Fix for Avinash. Who, hilariously, thinks my name is Carl.) Added sorting methods to ensure static ordering of header and body fields.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3377 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-05-18 16:58:13 +00:00
parent 2a212c497f
commit e016491a3d
4 changed files with 262 additions and 207 deletions

View File

@ -1,13 +1,15 @@
package org.broadinstitute.sting.gatk.walkers.coverage; package org.broadinstitute.sting.gatk.walkers.coverage;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
/** /**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
@ -17,8 +19,6 @@ import java.util.Map;
*/ */
public class CoverageUtils { public class CoverageUtils {
public enum PartitionType { BY_READ_GROUP, BY_SAMPLE }
/** /**
* Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context * Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context
* as an array of ints, indexed by the index fields of BaseUtils * as an array of ints, indexed by the index fields of BaseUtils
@ -40,21 +40,37 @@ public class CoverageUtils {
return counts; return counts;
} }
public static String getTypeID( SAMRecord r, CoverageAggregator.AggregationType type ) {
if ( type == CoverageAggregator.AggregationType.SAMPLE ) {
return r.getReadGroup().getSample();
} else if ( type == CoverageAggregator.AggregationType.READGROUP ) {
return String.format("%s_rg_%s",r.getReadGroup().getSample(),r.getReadGroup().getReadGroupId());
} else if ( type == CoverageAggregator.AggregationType.LIBRARY ) {
return r.getReadGroup().getLibrary();
} else {
throw new StingException("Invalid type ID sent to getTypeID. This is a BUG!");
}
}
public static Map<CoverageAggregator.AggregationType,Map<String,int[]>>
getBaseCountsByPartition(AlignmentContext context, int minMapQ, int minBaseQ, List<CoverageAggregator.AggregationType> types) {
public static Map<String,int[]> getBaseCountsBySample(AlignmentContext context, int minMapQ, int minBaseQ, PartitionType type) { Map<CoverageAggregator.AggregationType,Map<String,int[]>> countsByIDByType = new HashMap<CoverageAggregator.AggregationType,Map<String,int[]>>();
Map<String,int[]> samplesToCounts = new HashMap<String,int[]>(); for (CoverageAggregator.AggregationType t : types ) {
countsByIDByType.put(t,new HashMap<String,int[]>());
}
for (PileupElement e : context.getBasePileup()) { for (PileupElement e : context.getBasePileup()) {
if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) { if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) {
String sample = type == PartitionType.BY_SAMPLE ? e.getRead().getReadGroup().getSample() : for (CoverageAggregator.AggregationType t : types ) {
e.getRead().getReadGroup().getSample()+"_rg_"+e.getRead().getReadGroup().getReadGroupId(); String id = getTypeID(e.getRead(),t);
if ( ! samplesToCounts.keySet().contains(sample) ) if ( ! countsByIDByType.get(t).keySet().contains(id) ) {
samplesToCounts.put(sample,new int[6]); countsByIDByType.get(t).put(id,new int[6]);
updateCounts(samplesToCounts.get(sample),e); }
updateCounts(countsByIDByType.get(t).get(id),e);
}
} }
} }
return samplesToCounts; return countsByIDByType;
} }
private static void updateCounts(int[] counts, PileupElement e) { private static void updateCounts(int[] counts, PileupElement e) {

View File

@ -84,6 +84,19 @@ public class DepthOfCoverageStats {
totalDepthOfCoverage = 0; totalDepthOfCoverage = 0;
} }
public DepthOfCoverageStats(DepthOfCoverageStats cloneMe) {
this.binLeftEndpoints = cloneMe.binLeftEndpoints;
this.granularHistogramBySample = cloneMe.granularHistogramBySample;
this.totalCoverages = cloneMe.totalCoverages;
this.nLoci = cloneMe.nLoci;
this.totalLocusDepth = cloneMe.totalLocusDepth;
this.totalDepthOfCoverage = cloneMe.totalDepthOfCoverage;
this.locusHistogram = cloneMe.locusHistogram;
this.locusCoverageCounts = cloneMe.locusCoverageCounts;
this.tabulateLocusCounts = cloneMe.tabulateLocusCounts;
this.includeDeletions = cloneMe.includeDeletions;
}
public void addSample(String sample) { public void addSample(String sample) {
if ( granularHistogramBySample.containsKey(sample) ) { if ( granularHistogramBySample.containsKey(sample) ) {
return; return;

View File

@ -67,7 +67,7 @@ import java.util.*;
// todo -- allow for user to set linear binning (default is logarithmic) // todo -- allow for user to set linear binning (default is logarithmic)
// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
@By(DataSource.REFERENCE) @By(DataSource.REFERENCE)
public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, CoverageAggregator> implements TreeReducible<CoverageAggregator> { public class DepthOfCoverageWalker extends LocusWalker<Map<CoverageAggregator.AggregationType,Map<String,int[]>>, CoverageAggregator> implements TreeReducible<CoverageAggregator> {
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false)
int start = 1; int start = 1;
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false)
@ -90,10 +90,8 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
boolean printBinEndpointsAndExit = false; boolean printBinEndpointsAndExit = false;
@Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false) @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false)
boolean omitSampleSummary = false; boolean omitSampleSummary = false;
@Argument(fullName = "useReadGroups", shortName = "rg", doc = "Split depth of coverage output by read group rather than by sample", required = false) @Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library.", required = false)
boolean useReadGroup = false; String[] partitionTypes = new String[] {"sample"};
@Argument(fullName = "useBothSampleAndReadGroup", shortName = "both", doc = "Split output by both read group and by sample", required = false)
boolean useBoth = false;
@Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
boolean includeDeletions = false; boolean includeDeletions = false;
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
@ -104,7 +102,10 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
String outputFormat = "rtable"; String outputFormat = "rtable";
String[] OUTPUT_FORMATS = {"table","rtable","csv"}; String[] OUTPUT_FORMATS = {"table","rtable","csv"};
String[] PARTITION_TYPES = {"sample","readgroup","library"};
String separator = "\t"; String separator = "\t";
List<CoverageAggregator.AggregationType> aggregationTypes = new ArrayList<CoverageAggregator.AggregationType>();
Map<CoverageAggregator.AggregationType,List<String>> orderCheck = new HashMap<CoverageAggregator.AggregationType,List<String>>();
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS // STANDARD WALKER METHODS
@ -124,6 +125,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
System.exit(0); System.exit(0);
} }
// Check the output format
boolean goodOutputFormat = false; boolean goodOutputFormat = false;
for ( String f : OUTPUT_FORMATS ) { for ( String f : OUTPUT_FORMATS ) {
goodOutputFormat = goodOutputFormat || f.equals(outputFormat); goodOutputFormat = goodOutputFormat || f.equals(outputFormat);
@ -138,18 +140,33 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
separator = ","; separator = ",";
} }
// Check the partition types
for ( String t : partitionTypes ) {
boolean valid = false;
for ( String s : PARTITION_TYPES ) {
if ( s.equalsIgnoreCase(t) ) {
valid = true;
}
}
if ( ! valid ) {
throw new StingException("The partition type '"+t+"' is not a valid partition type. Please use any combination of 'sample','readgroup','library'.");
} else {
aggregationTypes.add(CoverageAggregator.typeStringToAggregationType(t));
}
}
if ( getToolkit().getArguments().outFileName == null ) { if ( getToolkit().getArguments().outFileName == null ) {
logger.warn("This walker creates many output files from one input file; you may wish to specify an input file rather "+ logger.warn("This walker creates many output files from one input file; you may wish to specify an input file rather "+
"than defaulting all output to stdout."); "than defaulting all output to stdout.");
} }
if ( ! omitDepthOutput ) { // print header if ( ! omitDepthOutput ) { // print header
out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth"); out.printf("%s\t%s","Locus","Total_Depth");
// get all the samples for (CoverageAggregator.AggregationType type : aggregationTypes ) {
HashSet<String> allSamples = getSamplesFromToolKit(useReadGroup); out.printf("\t%s_%s","Average_Depth",agTypeToString(type));
if ( useBoth ) {
allSamples.addAll(getSamplesFromToolKit(!useReadGroup));
} }
// get all the samples
HashSet<String> allSamples = getSamplesFromToolKit(aggregationTypes);
for ( String s : allSamples) { for ( String s : allSamples) {
out.printf("\t%s_%s","Depth_for",s); out.printf("\t%s_%s","Depth_for",s);
@ -163,78 +180,77 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
} else { } else {
out.printf("Per-Locus Depth of Coverage output was omitted"); out.printf("Per-Locus Depth of Coverage output was omitted");
} }
for (CoverageAggregator.AggregationType type : aggregationTypes ) {
orderCheck.put(type,new ArrayList<String>());
for ( String id : getSamplesFromToolKit(type) ) {
orderCheck.get(type).add(id);
}
Collections.sort(orderCheck.get(type));
}
} }
private HashSet<String> getSamplesFromToolKit( boolean getReadGroupsInstead ) { private HashSet<String> getSamplesFromToolKit( List<CoverageAggregator.AggregationType> types ) {
HashSet<String> partitions = new HashSet<String>(); // since the DOCS object uses a HashMap, this will be in the same order HashSet<String> partitions = new HashSet<String>(); // since the DOCS object uses a HashMap, this will be in the same order
for (CoverageAggregator.AggregationType t : types ) {
if ( getReadGroupsInstead ) { partitions.addAll(getSamplesFromToolKit(t));
for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
partitions.add(rg.getSample()+"_rg_"+rg.getReadGroupId());
}
} else {
for ( Set<String> sampleSet : getToolkit().getSamplesByReaders()) {
for (String s : sampleSet) {
partitions.add(s);
}
}
} }
return partitions; return partitions;
} }
private HashSet<String> getSamplesFromToolKit(CoverageAggregator.AggregationType type) {
HashSet<String> partition = new HashSet<String>();
if ( type == CoverageAggregator.AggregationType.SAMPLE ) {
for ( Set<String> sampleSet : getToolkit().getSamplesByReaders() ) {
for ( String s : sampleSet ) {
partition.add(s);
}
}
} else if ( type == CoverageAggregator.AggregationType.READGROUP ) {
for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId());
}
} else if ( type == CoverageAggregator.AggregationType.LIBRARY ) {
for ( Set<String> libraries : getToolkit().getLibrariesByReaders() ) {
for ( String l : libraries ) {
partition.add(l);
}
}
} else {
throw new StingException("Invalid aggregation type sent to getSamplesFromToolKit");
}
return partition;
}
public boolean isReduceByInterval() { public boolean isReduceByInterval() {
return ( ! omitIntervals ); return ( ! omitIntervals );
} }
public CoverageAggregator reduceInit() { public CoverageAggregator reduceInit() {
CoverageAggregator aggro = new CoverageAggregator(aggregationTypes,start,stop,nBins);
CoverageAggregator.AggregationType agType; for (CoverageAggregator.AggregationType t : aggregationTypes ) {
if ( useBoth ) { aggro.addIdentifiers(t,getSamplesFromToolKit(t));
agType = CoverageAggregator.AggregationType.BOTH;
} else {
agType = useReadGroup ? CoverageAggregator.AggregationType.READ : CoverageAggregator.AggregationType.SAMPLE;
} }
CoverageAggregator aggro = new CoverageAggregator(agType,start,stop,nBins);
if ( ! useReadGroup || useBoth ) {
aggro.addSamples(getSamplesFromToolKit(false));
}
if ( useReadGroup || useBoth ) {
aggro.addReadGroups(getSamplesFromToolKit(true));
}
aggro.initialize(includeDeletions,omitLocusTable); aggro.initialize(includeDeletions,omitLocusTable);
checkOrder(aggro);
return aggro; return aggro;
} }
public Map<String,int[]> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Map<CoverageAggregator.AggregationType,Map<String,int[]>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( ! omitDepthOutput ) { if ( ! omitDepthOutput ) {
out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives)
//System.out.printf("\t[log]\t%s",ref.getLocus()); //System.out.printf("\t[log]\t%s",ref.getLocus());
} }
Map<String,int[]> countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality, return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,minBaseQuality,aggregationTypes);
useReadGroup ? CoverageUtils.PartitionType.BY_READ_GROUP : CoverageUtils.PartitionType.BY_SAMPLE);
if ( useBoth ) {
Map<String,int[]> countsByOther = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality,
!useReadGroup ? CoverageUtils.PartitionType.BY_READ_GROUP : CoverageUtils.PartitionType.BY_SAMPLE);
for ( String s : countsByOther.keySet()) {
countsBySample.put(s,countsByOther.get(s));
}
} }
return countsBySample; public CoverageAggregator reduce(Map<CoverageAggregator.AggregationType,Map<String,int[]>> thisMap, CoverageAggregator prevReduce) {
}
public CoverageAggregator reduce(Map<String,int[]> thisMap, CoverageAggregator prevReduce) {
if ( ! omitDepthOutput ) { if ( ! omitDepthOutput ) {
printDepths(out,thisMap, prevReduce.getAllSamples()); printDepths(out,thisMap, prevReduce.getIdentifiersByType());
// this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without
// turning on omit // turning on omit
} }
@ -254,20 +270,26 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
public void onTraversalDone( List<Pair<GenomeLoc,CoverageAggregator>> statsByInterval ) { public void onTraversalDone( List<Pair<GenomeLoc,CoverageAggregator>> statsByInterval ) {
if ( refSeqGeneList != null && ( useBoth || ! useReadGroup ) ) { if ( refSeqGeneList != null && aggregationTypes.contains(CoverageAggregator.AggregationType.SAMPLE ) ) {
printGeneStats(statsByInterval); printGeneStats(statsByInterval);
} }
if ( useBoth || ! useReadGroup ) { if ( aggregationTypes.contains(CoverageAggregator.AggregationType.SAMPLE) ) {
File intervalStatisticsFile = deriveFromStream("sample_interval_statistics"); File intervalStatisticsFile = deriveFromStream("sample_interval_statistics");
File intervalSummaryFile = deriveFromStream("sample_interval_summary"); File intervalSummaryFile = deriveFromStream("sample_interval_summary");
printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, true); printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, CoverageAggregator.AggregationType.SAMPLE );
} }
if ( useBoth || useReadGroup ) { if ( aggregationTypes.contains(CoverageAggregator.AggregationType.READGROUP ) ) {
File intervalStatisticsFile = deriveFromStream("read_group_interval_statistics"); File intervalStatisticsFile = deriveFromStream("read_group_interval_statistics");
File intervalSummaryFile = deriveFromStream("read_group_interval_summary"); File intervalSummaryFile = deriveFromStream("read_group_interval_summary");
printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, false); printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, CoverageAggregator.AggregationType.READGROUP);
}
if ( aggregationTypes.contains(CoverageAggregator.AggregationType.LIBRARY) ) {
File intervalStatisticsFile = deriveFromStream("library_interval_statistics");
File intervalSummaryFile = deriveFromStream("library_interval_summary");
printIntervalStats(statsByInterval,intervalSummaryFile,intervalStatisticsFile,CoverageAggregator.AggregationType.LIBRARY);
} }
onTraversalDone(mergeAll(statsByInterval)); onTraversalDone(mergeAll(statsByInterval));
@ -283,7 +305,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
return first; return first;
} }
private DepthOfCoverageStats printIntervalStats(List<Pair<GenomeLoc,CoverageAggregator>> statsByInterval, File summaryFile, File statsFile, boolean isSample) { private DepthOfCoverageStats printIntervalStats(List<Pair<GenomeLoc,CoverageAggregator>> statsByInterval, File summaryFile, File statsFile, CoverageAggregator.AggregationType type) {
PrintStream summaryOut; PrintStream summaryOut;
PrintStream statsOut; PrintStream statsOut;
@ -296,7 +318,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
Pair<GenomeLoc,CoverageAggregator> firstPair = statsByInterval.get(0); Pair<GenomeLoc,CoverageAggregator> firstPair = statsByInterval.get(0);
CoverageAggregator firstAggregator = firstPair.second; CoverageAggregator firstAggregator = firstPair.second;
DepthOfCoverageStats firstStats = isSample ? firstAggregator.getCoverageBySample() : firstAggregator.getCoverageByReadGroup(); DepthOfCoverageStats firstStats = firstAggregator.getCoverageByAggregationType(type);
StringBuilder summaryHeader = new StringBuilder(); StringBuilder summaryHeader = new StringBuilder();
summaryHeader.append("Target"); summaryHeader.append("Target");
@ -330,8 +352,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
for ( Pair<GenomeLoc,CoverageAggregator> targetAggregator : statsByInterval ) { for ( Pair<GenomeLoc,CoverageAggregator> targetAggregator : statsByInterval ) {
Pair<GenomeLoc,DepthOfCoverageStats> targetStats = new Pair<GenomeLoc,DepthOfCoverageStats>( Pair<GenomeLoc,DepthOfCoverageStats> targetStats = new Pair<GenomeLoc,DepthOfCoverageStats>(
targetAggregator.first, isSample ? targetAggregator.second.getCoverageBySample() : targetAggregator.first, targetAggregator.second.getCoverageByAggregationType(type));
targetAggregator.second.getCoverageByReadGroup());
printTargetSummary(summaryOut,targetStats); printTargetSummary(summaryOut,targetStats);
updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second); updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second);
} }
@ -354,10 +375,10 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
for ( Pair<GenomeLoc,CoverageAggregator> targetStats : statsByTarget ) { for ( Pair<GenomeLoc,CoverageAggregator> targetStats : statsByTarget ) {
String gene = getGeneName(targetStats.first,refseqIterator); String gene = getGeneName(targetStats.first,refseqIterator);
if ( geneNamesToStats.keySet().contains(gene) ) { if ( geneNamesToStats.keySet().contains(gene) ) {
geneNamesToStats.get(gene).merge(targetStats.second.getCoverageBySample()); geneNamesToStats.get(gene).merge(targetStats.second.getCoverageByAggregationType(CoverageAggregator.AggregationType.SAMPLE));
} else { } else {
geneNamesToStats.put(gene,targetStats.second.getCoverageBySample()); geneNamesToStats.put(gene,targetStats.second.getCoverageByAggregationType(CoverageAggregator.AggregationType.SAMPLE));
statsByGene.add(new Pair<String,DepthOfCoverageStats>(gene,targetStats.second.getCoverageBySample())); statsByGene.add(new Pair<String,DepthOfCoverageStats>(gene,new DepthOfCoverageStats(targetStats.second.getCoverageByAggregationType(CoverageAggregator.AggregationType.SAMPLE))));
} }
} }
@ -498,35 +519,42 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
if ( ! omitSampleSummary ) { if ( ! omitSampleSummary ) {
logger.info("Printing summary info"); logger.info("Printing summary info");
if ( ! useReadGroup || useBoth ) { for (CoverageAggregator.AggregationType type : aggregationTypes ) {
File summaryStatisticsFile = deriveFromStream("sample_summary_statistics"); outputSummaryFiles(coverageProfiles,type);
File perSampleStatisticsFile = deriveFromStream("sample_statistics");
printSummary(out,summaryStatisticsFile,coverageProfiles.getCoverageBySample());
printPerSample(out,perSampleStatisticsFile,coverageProfiles.getCoverageBySample());
}
if ( useReadGroup || useBoth ) {
File rgStatsFile = deriveFromStream("read_group_summary");
File rgSumFile = deriveFromStream("read_group_statistics");
printSummary(out,rgStatsFile,coverageProfiles.getCoverageByReadGroup());
printPerSample(out,rgSumFile,coverageProfiles.getCoverageByReadGroup());
} }
} }
if ( ! omitLocusTable ) { if ( ! omitLocusTable ) {
logger.info("Printing locus summary"); logger.info("Printing locus summary");
if ( ! useReadGroup || useBoth ) { for (CoverageAggregator.AggregationType type : aggregationTypes ) {
File perLocusStatisticsFile = deriveFromStream("sample_locus_statistics"); outputLocusFiles(coverageProfiles,type);
File perLocusCoverageFile = deriveFromStream("sample_coverage_statistics"); }
printPerLocus(perLocusStatisticsFile,perLocusCoverageFile,coverageProfiles.getCoverageBySample(),false); }
} }
if ( useReadGroup || useBoth ) { private String agTypeToString(CoverageAggregator.AggregationType type) {
File perLocusRGStats = deriveFromStream("read_group_locus_statistics"); if ( type == CoverageAggregator.AggregationType.SAMPLE ) {
File perLocusRGCoverage = deriveFromStream("read_group_locus_coverage"); return "sample";
printPerLocus(perLocusRGStats,perLocusRGCoverage,coverageProfiles.getCoverageByReadGroup(),true); } else if ( type == CoverageAggregator.AggregationType.READGROUP ) {
return "read_group";
} else if ( type == CoverageAggregator.AggregationType.LIBRARY ) {
return "library";
} else {
throw new StingException("Invalid aggregation type "+type+" sent to agTypeToString. This is a BUG!");
} }
} }
private void outputLocusFiles(CoverageAggregator coverageProfiles, CoverageAggregator.AggregationType type ) {
File locusStats = deriveFromStream(agTypeToString(type)+"_cumulative_coverage_counts");
File coverageStats = deriveFromStream(agTypeToString(type)+"_cumulative_coverage_proportions");
printPerLocus(locusStats,coverageStats,coverageProfiles.getCoverageByAggregationType(type),agTypeToString(type));
}
private void outputSummaryFiles(CoverageAggregator coverageProfiles, CoverageAggregator.AggregationType type ) {
File summaryFile = deriveFromStream(agTypeToString(type)+"_summary");
File statsFile = deriveFromStream(agTypeToString(type)+"_statistics");
printPerSample(out,statsFile,coverageProfiles.getCoverageByAggregationType(type));
printSummary(out,summaryFile,coverageProfiles.getCoverageByAggregationType(type));
} }
public File deriveFromStream(String append) { public File deriveFromStream(String append) {
@ -568,7 +596,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
} }
} }
private void printPerLocus(File locusFile, File coverageFile, DepthOfCoverageStats stats, boolean isRG) { private void printPerLocus(File locusFile, File coverageFile, DepthOfCoverageStats stats, String partitionType) {
PrintStream output = getCorrectStream(out,locusFile); PrintStream output = getCorrectStream(out,locusFile);
PrintStream coverageOut = getCorrectStream(out,coverageFile); PrintStream coverageOut = getCorrectStream(out,coverageFile);
if ( output == null ) { if ( output == null ) {
@ -587,11 +615,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
StringBuilder header = new StringBuilder(); StringBuilder header = new StringBuilder();
if ( printSampleColumnHeader ) { if ( printSampleColumnHeader ) {
if ( isRG ) { header.append(partitionType);
header.append("ReadGroup");
} else {
header.append("Sample");
}
} }
header.append(String.format("%sgte_0",separator)); header.append(String.format("%sgte_0",separator));
for ( int d : endpoints ) { for ( int d : endpoints ) {
@ -689,24 +713,34 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
return bin; return bin;
} }
private void printDepths(PrintStream stream, Map<String,int[]> countsBySample, Set<String> allSamples) { private void printDepths(PrintStream stream, Map<CoverageAggregator.AggregationType,Map<String,int[]>> countsBySampleByType, Map<CoverageAggregator.AggregationType,List<String>> identifiersByType) {
// get the depths per sample and build up the output string while tabulating total and average coverage // get the depths per sample and build up the output string while tabulating total and average coverage
// todo -- update me to deal with base counts/indels
StringBuilder perSampleOutput = new StringBuilder(); StringBuilder perSampleOutput = new StringBuilder();
int tDepth = 0; int tDepth = 0;
for ( String s : allSamples ) { boolean depthCounted = false;
for (CoverageAggregator.AggregationType type : aggregationTypes ) {
Map<String,int[]> countsByID = countsBySampleByType.get(type);
for ( String s : identifiersByType.get(type) ) {
perSampleOutput.append(separator); perSampleOutput.append(separator);
long dp = countsBySample.keySet().contains(s) ? sumArray(countsBySample.get(s)) : 0; long dp = countsByID.keySet().contains(s) ? sumArray(countsByID.get(s)) : 0 ;
perSampleOutput.append(dp); perSampleOutput.append(dp);
if ( printBaseCounts ) { if ( printBaseCounts ) {
perSampleOutput.append(separator); perSampleOutput.append(separator);
perSampleOutput.append(baseCounts(countsBySample.get(s))); perSampleOutput.append(baseCounts(countsByID.get(s)));
} }
if ( ! depthCounted ) {
tDepth += dp; tDepth += dp;
} }
}
depthCounted = true; // only sum the total depth once
}
// remember -- genome locus was printed in map() // remember -- genome locus was printed in map()
stream.printf("%s%d%s%.2f%s%n",separator,tDepth,separator,( (double) tDepth/ (double) allSamples.size()), stream.printf("%s%d",separator,tDepth);
perSampleOutput); for (CoverageAggregator.AggregationType type : aggregationTypes ) {
stream.printf("%s%.2f",separator, ( (double) tDepth / identifiersByType.get(type).size() ) );
}
stream.printf("%s%n",perSampleOutput);
} }
private long sumArray(int[] array) { private long sumArray(int[] array) {
@ -737,115 +771,99 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<String,int[]>, Covera
return s.toString(); return s.toString();
} }
private void checkOrder(CoverageAggregator ag) {
for (CoverageAggregator.AggregationType t : aggregationTypes ) {
List<String> order = orderCheck.get(t);
List<String> namesInAg = ag.getIdentifiersByType().get(t);
Set<String> namesInDOCS = ag.getCoverageByAggregationType(t).getAllSamples();
int index = 0;
for ( String s : namesInAg ) {
if ( ! s.equalsIgnoreCase(order.get(index)) ) {
throw new StingException("IDs are out of order for type "+t+"! Aggregator has different ordering");
}
index++;
}
}
}
} }
class CoverageAggregator { class CoverageAggregator {
private DepthOfCoverageStats coverageByRead;
private DepthOfCoverageStats coverageBySample;
enum AggregationType { READ, SAMPLE, BOTH } enum AggregationType { READGROUP, SAMPLE, LIBRARY }
private AggregationType agType; private List<AggregationType> types;
private Set<String> sampleNames; private Map<AggregationType,DepthOfCoverageStats> coverageProfiles;
private Set<String> allSamples; private Map<AggregationType,List<String>> identifiersByType;
private Set<String> allIdentifiers;
public CoverageAggregator(AggregationType type, int start, int stop, int nBins) { public CoverageAggregator(List<AggregationType> typesToUse, int start, int stop, int nBins) {
if ( type == AggregationType.READ || type == AggregationType.BOTH) { coverageProfiles = new HashMap<AggregationType,DepthOfCoverageStats>();
coverageByRead = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins)); identifiersByType = new HashMap<AggregationType,List<String>>();
types = typesToUse;
for ( AggregationType type : types ) {
coverageProfiles.put(type,new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins)));
identifiersByType.put(type,new ArrayList<String>());
} }
allIdentifiers = new HashSet<String>();
if ( type == AggregationType.SAMPLE || type == AggregationType.BOTH) {
coverageBySample = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins));
}
agType = type;
allSamples = new HashSet<String>();
} }
public void merge(CoverageAggregator otherAggregator) { public void merge(CoverageAggregator otherAggregator) {
if ( agType == AggregationType.SAMPLE || agType == AggregationType.BOTH ) { for ( AggregationType type : types ) {
this.coverageBySample.merge(otherAggregator.coverageBySample); this.coverageProfiles.get(type).merge(otherAggregator.coverageProfiles.get(type));
}
if ( agType == AggregationType.READ || agType == AggregationType.BOTH) {
this.coverageByRead.merge(otherAggregator.coverageByRead);
} }
} }
public DepthOfCoverageStats getCoverageByReadGroup() { public DepthOfCoverageStats getCoverageByAggregationType(AggregationType t) {
return coverageByRead; return coverageProfiles.get(t);
} }
public DepthOfCoverageStats getCoverageBySample() { public void addIdentifiers(AggregationType t, Set<String> ids) {
return coverageBySample; for ( String s : ids ) {
coverageProfiles.get(t).addSample(s);
identifiersByType.get(t).add(s);
allIdentifiers.add(s);
} }
Collections.sort(identifiersByType.get(t));
public void addSamples(Set<String> samples) {
for ( String s : samples ) {
coverageBySample.addSample(s);
allSamples.add(s);
} }
if ( agType == AggregationType.BOTH ) {
sampleNames = samples;
}
}
public void addReadGroups(Set<String> readGroupNames) {
for ( String s : readGroupNames ) {
coverageByRead.addSample(s);
allSamples.add(s);
}
}
public void initialize(boolean useDels, boolean omitLocusTable) { public void initialize(boolean useDels, boolean omitLocusTable) {
if ( agType == AggregationType.SAMPLE || agType == AggregationType.BOTH ) { for ( AggregationType t : types ) {
if ( useDels ) { if ( useDels ) {
coverageBySample.initializeDeletions(); coverageProfiles.get(t).initializeDeletions();
} }
if ( ! omitLocusTable ) { if ( ! omitLocusTable ) {
coverageBySample.initializeLocusCounts(); coverageProfiles.get(t).initializeLocusCounts();
}
}
if ( agType == AggregationType.READ || agType == AggregationType.BOTH) {
if ( useDels ) {
coverageByRead.initializeDeletions();
}
if ( ! omitLocusTable ) {
coverageByRead.initializeLocusCounts();
} }
} }
} }
public void update(Map<String,int[]> countsByIdentifier) { public void update(Map<AggregationType,Map<String,int[]>> countsByIdentifierByType) {
if ( agType != AggregationType.BOTH ) { for ( AggregationType t : types ) {
if ( agType == AggregationType.SAMPLE ) { coverageProfiles.get(t).update(countsByIdentifierByType.get(t));
coverageBySample.update(countsByIdentifier);
} else {
coverageByRead.update(countsByIdentifier);
}
} else { // have to separate samples and read groups
HashMap<String,int[]> countsBySample = new HashMap<String,int[]>(sampleNames.size());
HashMap<String,int[]> countsByRG = new HashMap<String,int[]>(allSamples.size()-sampleNames.size());
for ( String s : countsByIdentifier.keySet() ) {
if ( sampleNames.contains(s) ) {
countsBySample.put(s,countsByIdentifier.get(s));
} else { // cannot use .remove() to save time due to concurrency issues
countsByRG.put(s,countsByIdentifier.get(s));
} }
} }
coverageBySample.update(countsBySample); public Set<String> getAllIdentifiers() {
coverageByRead.update(countsByRG); return allIdentifiers;
}
} }
public Set<String> getAllSamples() { public Map<AggregationType,List<String>> getIdentifiersByType() {
return allSamples; return identifiersByType;
} }
public static AggregationType typeStringToAggregationType(String type) {
if ( type.equals("sample") ) {
return AggregationType.SAMPLE;
}
if ( type.equals("library") ) {
return AggregationType.LIBRARY;
}
if ( type.equals("readgroup") ) {
return AggregationType.READGROUP;
}
throw new StingException("Valid partition type string "+type+" is not associated with an aggregation type. This is a BUG!");
}
} }

View File

@ -53,21 +53,29 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
String[] intervals = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/fhs_jhs_30_targts.interval_list"}; String[] intervals = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/fhs_jhs_30_targts.interval_list"};
String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/FHS_indexed_subset.bam"}; String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/FHS_indexed_subset.bam"};
String cmd = buildRootCmd(hg18,new ArrayList<String>(Arrays.asList(bams)),new ArrayList<String>(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -both --outputFormat csv"; String cmd = buildRootCmd(hg18,new ArrayList<String>(Arrays.asList(bams)),new ArrayList<String>(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -pt readgroup -pt sample -pt library --outputFormat csv";
WalkerTestSpec spec = new WalkerTestSpec(cmd,0, new ArrayList<String>()); WalkerTestSpec spec = new WalkerTestSpec(cmd,0, new ArrayList<String>());
// now add the expected files that get generated // now add the expected files that get generated
spec.addAuxFile("f53ddf25c2b71e46381f9c434402d88d", baseOutputFile); spec.addAuxFile("fc742e346be2344557cf8c039f467508", baseOutputFile);
spec.addAuxFile("925cc5b49286e0222bce6251d1baafc7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); spec.addAuxFile("e58b701b01ec0dbe75c146295434ba3b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts"));
spec.addAuxFile("d9e812398d778f28ed12d7f3d18628e2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); spec.addAuxFile("b9a7748e5aec4dc06daed893c901c00d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions"));
spec.addAuxFile("80577bf378de570f84d91b0ef6004102", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_locus_statistics")); spec.addAuxFile("848e556ec7e03e9b0398d189d7cbb4ad", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics"));
spec.addAuxFile("3a059ad82d945643dc4e03239c4041f5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary_statistics")); spec.addAuxFile("e6fc8f7a5fcc440e21de5891f3403d5d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary"));
spec.addAuxFile("f3315551081331bc322c53b11412d707", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); spec.addAuxFile("cac8e7a688d9bbe781232c61091d3237", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics"));
spec.addAuxFile("e452dfb5581a7aafaf2122c5ae497f1b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary"));
spec.addAuxFile("38fb89e1bb52d0342f97f72e86956b79", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts"));
spec.addAuxFile("f9f2941ee39577ac2f80668e7f6b3d4b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions"));
spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics"));
spec.addAuxFile("fd29fe0c14351e934a6fef9df1f38f7b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); spec.addAuxFile("fd29fe0c14351e934a6fef9df1f38f7b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary"));
spec.addAuxFile("111261f0e8ccf8c456d0b2a9482bc32c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_locus_statistics"));
spec.addAuxFile("cc7ee5075a932dba486e78824ca34202", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); spec.addAuxFile("cc7ee5075a932dba486e78824ca34202", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics"));
spec.addAuxFile("e1653480daa2d96f7c584ed4cd20c147", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); spec.addAuxFile("e1653480daa2d96f7c584ed4cd20c147", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary"));
spec.addAuxFile("529353375d23c529228b38119c51e269", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts"));
spec.addAuxFile("650ee3714da7fbad7832c9d4ad49eb51", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions"));
spec.addAuxFile("925cc5b49286e0222bce6251d1baafc7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics"));
spec.addAuxFile("d9e812398d778f28ed12d7f3d18628e2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary"));
spec.addAuxFile("f3315551081331bc322c53b11412d707", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics"));
spec.addAuxFile("3a059ad82d945643dc4e03239c4041f5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary"));
execute("testBaseOutputNoFiltering",spec); execute("testBaseOutputNoFiltering",spec);
} }