From e016491a3db9b6c71e5537b68152c9648d9fee50 Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 18 May 2010 16:58:13 +0000 Subject: [PATCH] Major refactoring of Depth of Coverage to allow for more extensible partitions of data (now can do read group, sample, and library; in any combination; adding more is fairly easy). Changed the by-gene code to use clones of stats objects, rather than munging the interval DoCs. (Fix for Avinash. Who, hilariously, thinks my name is Carl.) Added sorting methods to ensure static ordering of header and body fields. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3377 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/coverage/CoverageUtils.java | 40 +- .../coverage/DepthOfCoverageStats.java | 13 + .../coverage/DepthOfCoverageWalker.java | 392 +++++++++--------- .../DepthOfCoverageIntegrationTest.java | 24 +- 4 files changed, 262 insertions(+), 207 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index e35705eee..7538b8693 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -1,13 +1,15 @@ package org.broadinstitute.sting.gatk.walkers.coverage; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Set; /** * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl @@ -17,8 +19,6 @@ import java.util.Map; */ public class CoverageUtils { - public enum PartitionType { BY_READ_GROUP, BY_SAMPLE } - /** * Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context * as an array of ints, indexed by the index fields of BaseUtils @@ -40,21 +40,37 @@ public class CoverageUtils { return counts; } + public static String getTypeID( SAMRecord r, CoverageAggregator.AggregationType type ) { + if ( type == CoverageAggregator.AggregationType.SAMPLE ) { + return r.getReadGroup().getSample(); + } else if ( type == CoverageAggregator.AggregationType.READGROUP ) { + return String.format("%s_rg_%s",r.getReadGroup().getSample(),r.getReadGroup().getReadGroupId()); + } else if ( type == CoverageAggregator.AggregationType.LIBRARY ) { + return r.getReadGroup().getLibrary(); + } else { + throw new StingException("Invalid type ID sent to getTypeID. This is a BUG!"); + } + } + public static Map> + getBaseCountsByPartition(AlignmentContext context, int minMapQ, int minBaseQ, List types) { - public static Map getBaseCountsBySample(AlignmentContext context, int minMapQ, int minBaseQ, PartitionType type) { - Map samplesToCounts = new HashMap(); - + Map> countsByIDByType = new HashMap>(); + for (CoverageAggregator.AggregationType t : types ) { + countsByIDByType.put(t,new HashMap()); + } for (PileupElement e : context.getBasePileup()) { if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) { - String sample = type == PartitionType.BY_SAMPLE ? e.getRead().getReadGroup().getSample() : - e.getRead().getReadGroup().getSample()+"_rg_"+e.getRead().getReadGroup().getReadGroupId(); - if ( ! samplesToCounts.keySet().contains(sample) ) - samplesToCounts.put(sample,new int[6]); - updateCounts(samplesToCounts.get(sample),e); + for (CoverageAggregator.AggregationType t : types ) { + String id = getTypeID(e.getRead(),t); + if ( ! countsByIDByType.get(t).keySet().contains(id) ) { + countsByIDByType.get(t).put(id,new int[6]); + } + updateCounts(countsByIDByType.get(t).get(id),e); + } } } - return samplesToCounts; + return countsByIDByType; } private static void updateCounts(int[] counts, PileupElement e) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java index 70d1d0f26..6114a4024 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java @@ -84,6 +84,19 @@ public class DepthOfCoverageStats { totalDepthOfCoverage = 0; } + public DepthOfCoverageStats(DepthOfCoverageStats cloneMe) { + this.binLeftEndpoints = cloneMe.binLeftEndpoints; + this.granularHistogramBySample = cloneMe.granularHistogramBySample; + this.totalCoverages = cloneMe.totalCoverages; + this.nLoci = cloneMe.nLoci; + this.totalLocusDepth = cloneMe.totalLocusDepth; + this.totalDepthOfCoverage = cloneMe.totalDepthOfCoverage; + this.locusHistogram = cloneMe.locusHistogram; + this.locusCoverageCounts = cloneMe.locusCoverageCounts; + this.tabulateLocusCounts = cloneMe.tabulateLocusCounts; + this.includeDeletions = cloneMe.includeDeletions; + } + public void addSample(String sample) { if ( granularHistogramBySample.containsKey(sample) ) { return; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 9f2cadedc..d9e9fa584 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -67,7 +67,7 @@ import java.util.*; // todo -- allow for user to set linear binning (default is logarithmic) // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) -public class DepthOfCoverageWalker extends LocusWalker, CoverageAggregator> implements TreeReducible { +public class DepthOfCoverageWalker extends LocusWalker>, CoverageAggregator> implements TreeReducible { @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) int start = 1; @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) @@ -90,10 +90,8 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera boolean printBinEndpointsAndExit = false; @Argument(fullName = "omitPerSampleStats", shortName = "omitSampleSummary", doc = "Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime.", required = false) boolean omitSampleSummary = false; - @Argument(fullName = "useReadGroups", shortName = "rg", doc = "Split depth of coverage output by read group rather than by sample", required = false) - boolean useReadGroup = false; - @Argument(fullName = "useBothSampleAndReadGroup", shortName = "both", doc = "Split output by both read group and by sample", required = false) - boolean useBoth = false; + @Argument(fullName = "partitionType", shortName = "pt", doc = "Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library.", required = false) + String[] partitionTypes = new String[] {"sample"}; @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) boolean includeDeletions = false; @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) @@ -104,7 +102,10 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera String outputFormat = "rtable"; String[] OUTPUT_FORMATS = {"table","rtable","csv"}; + String[] PARTITION_TYPES = {"sample","readgroup","library"}; String separator = "\t"; + List aggregationTypes = new ArrayList(); + Map> orderCheck = new HashMap>(); //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS @@ -124,6 +125,7 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera System.exit(0); } + // Check the output format boolean goodOutputFormat = false; for ( String f : OUTPUT_FORMATS ) { goodOutputFormat = goodOutputFormat || f.equals(outputFormat); @@ -138,18 +140,33 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera separator = ","; } + // Check the partition types + for ( String t : partitionTypes ) { + boolean valid = false; + for ( String s : PARTITION_TYPES ) { + if ( s.equalsIgnoreCase(t) ) { + valid = true; + } + } + if ( ! valid ) { + throw new StingException("The partition type '"+t+"' is not a valid partition type. Please use any combination of 'sample','readgroup','library'."); + } else { + aggregationTypes.add(CoverageAggregator.typeStringToAggregationType(t)); + } + } + if ( getToolkit().getArguments().outFileName == null ) { logger.warn("This walker creates many output files from one input file; you may wish to specify an input file rather "+ "than defaulting all output to stdout."); } if ( ! omitDepthOutput ) { // print header - out.printf("%s\t%s\t%s","Locus","Total_Depth","Average_Depth"); - // get all the samples - HashSet allSamples = getSamplesFromToolKit(useReadGroup); - if ( useBoth ) { - allSamples.addAll(getSamplesFromToolKit(!useReadGroup)); + out.printf("%s\t%s","Locus","Total_Depth"); + for (CoverageAggregator.AggregationType type : aggregationTypes ) { + out.printf("\t%s_%s","Average_Depth",agTypeToString(type)); } + // get all the samples + HashSet allSamples = getSamplesFromToolKit(aggregationTypes); for ( String s : allSamples) { out.printf("\t%s_%s","Depth_for",s); @@ -163,78 +180,77 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera } else { out.printf("Per-Locus Depth of Coverage output was omitted"); } + + for (CoverageAggregator.AggregationType type : aggregationTypes ) { + orderCheck.put(type,new ArrayList()); + for ( String id : getSamplesFromToolKit(type) ) { + orderCheck.get(type).add(id); + } + Collections.sort(orderCheck.get(type)); + } } - private HashSet getSamplesFromToolKit( boolean getReadGroupsInstead ) { + private HashSet getSamplesFromToolKit( List types ) { HashSet partitions = new HashSet(); // since the DOCS object uses a HashMap, this will be in the same order - - if ( getReadGroupsInstead ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - partitions.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); - } - } else { - for ( Set sampleSet : getToolkit().getSamplesByReaders()) { - for (String s : sampleSet) { - partitions.add(s); - } - } + for (CoverageAggregator.AggregationType t : types ) { + partitions.addAll(getSamplesFromToolKit(t)); } return partitions; } + private HashSet getSamplesFromToolKit(CoverageAggregator.AggregationType type) { + HashSet partition = new HashSet(); + if ( type == CoverageAggregator.AggregationType.SAMPLE ) { + for ( Set sampleSet : getToolkit().getSamplesByReaders() ) { + for ( String s : sampleSet ) { + partition.add(s); + } + } + } else if ( type == CoverageAggregator.AggregationType.READGROUP ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); + } + } else if ( type == CoverageAggregator.AggregationType.LIBRARY ) { + for ( Set libraries : getToolkit().getLibrariesByReaders() ) { + for ( String l : libraries ) { + partition.add(l); + } + } + } else { + throw new StingException("Invalid aggregation type sent to getSamplesFromToolKit"); + } + + return partition; + } + public boolean isReduceByInterval() { return ( ! omitIntervals ); } public CoverageAggregator reduceInit() { - - CoverageAggregator.AggregationType agType; - if ( useBoth ) { - agType = CoverageAggregator.AggregationType.BOTH; - } else { - agType = useReadGroup ? CoverageAggregator.AggregationType.READ : CoverageAggregator.AggregationType.SAMPLE; + CoverageAggregator aggro = new CoverageAggregator(aggregationTypes,start,stop,nBins); + for (CoverageAggregator.AggregationType t : aggregationTypes ) { + aggro.addIdentifiers(t,getSamplesFromToolKit(t)); } - - CoverageAggregator aggro = new CoverageAggregator(agType,start,stop,nBins); - - if ( ! useReadGroup || useBoth ) { - aggro.addSamples(getSamplesFromToolKit(false)); - } - - if ( useReadGroup || useBoth ) { - aggro.addReadGroups(getSamplesFromToolKit(true)); - } - aggro.initialize(includeDeletions,omitLocusTable); - + checkOrder(aggro); return aggro; } - public Map map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public Map> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( ! omitDepthOutput ) { out.printf("%s",ref.getLocus()); // yes: print locus in map, and the rest of the info in reduce (for eventual cumulatives) //System.out.printf("\t[log]\t%s",ref.getLocus()); } - Map countsBySample = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality, - useReadGroup ? CoverageUtils.PartitionType.BY_READ_GROUP : CoverageUtils.PartitionType.BY_SAMPLE); - - if ( useBoth ) { - Map countsByOther = CoverageUtils.getBaseCountsBySample(context,minMappingQuality,minBaseQuality, - !useReadGroup ? CoverageUtils.PartitionType.BY_READ_GROUP : CoverageUtils.PartitionType.BY_SAMPLE); - for ( String s : countsByOther.keySet()) { - countsBySample.put(s,countsByOther.get(s)); - } - } - - return countsBySample; + return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,minBaseQuality,aggregationTypes); } - public CoverageAggregator reduce(Map thisMap, CoverageAggregator prevReduce) { + public CoverageAggregator reduce(Map> thisMap, CoverageAggregator prevReduce) { if ( ! omitDepthOutput ) { - printDepths(out,thisMap, prevReduce.getAllSamples()); + printDepths(out,thisMap, prevReduce.getIdentifiersByType()); // this is an additional iteration through thisMap, plus dealing with IO, so should be much slower without // turning on omit } @@ -254,20 +270,26 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera //////////////////////////////////////////////////////////////////////////////////// public void onTraversalDone( List> statsByInterval ) { - if ( refSeqGeneList != null && ( useBoth || ! useReadGroup ) ) { + if ( refSeqGeneList != null && aggregationTypes.contains(CoverageAggregator.AggregationType.SAMPLE ) ) { printGeneStats(statsByInterval); } - if ( useBoth || ! useReadGroup ) { + if ( aggregationTypes.contains(CoverageAggregator.AggregationType.SAMPLE) ) { File intervalStatisticsFile = deriveFromStream("sample_interval_statistics"); File intervalSummaryFile = deriveFromStream("sample_interval_summary"); - printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, true); + printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, CoverageAggregator.AggregationType.SAMPLE ); } - if ( useBoth || useReadGroup ) { + if ( aggregationTypes.contains(CoverageAggregator.AggregationType.READGROUP ) ) { File intervalStatisticsFile = deriveFromStream("read_group_interval_statistics"); File intervalSummaryFile = deriveFromStream("read_group_interval_summary"); - printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, false); + printIntervalStats(statsByInterval,intervalSummaryFile, intervalStatisticsFile, CoverageAggregator.AggregationType.READGROUP); + } + + if ( aggregationTypes.contains(CoverageAggregator.AggregationType.LIBRARY) ) { + File intervalStatisticsFile = deriveFromStream("library_interval_statistics"); + File intervalSummaryFile = deriveFromStream("library_interval_summary"); + printIntervalStats(statsByInterval,intervalSummaryFile,intervalStatisticsFile,CoverageAggregator.AggregationType.LIBRARY); } onTraversalDone(mergeAll(statsByInterval)); @@ -283,7 +305,7 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera return first; } - private DepthOfCoverageStats printIntervalStats(List> statsByInterval, File summaryFile, File statsFile, boolean isSample) { + private DepthOfCoverageStats printIntervalStats(List> statsByInterval, File summaryFile, File statsFile, CoverageAggregator.AggregationType type) { PrintStream summaryOut; PrintStream statsOut; @@ -296,7 +318,7 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera Pair firstPair = statsByInterval.get(0); CoverageAggregator firstAggregator = firstPair.second; - DepthOfCoverageStats firstStats = isSample ? firstAggregator.getCoverageBySample() : firstAggregator.getCoverageByReadGroup(); + DepthOfCoverageStats firstStats = firstAggregator.getCoverageByAggregationType(type); StringBuilder summaryHeader = new StringBuilder(); summaryHeader.append("Target"); @@ -330,8 +352,7 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera for ( Pair targetAggregator : statsByInterval ) { Pair targetStats = new Pair( - targetAggregator.first, isSample ? targetAggregator.second.getCoverageBySample() : - targetAggregator.second.getCoverageByReadGroup()); + targetAggregator.first, targetAggregator.second.getCoverageByAggregationType(type)); printTargetSummary(summaryOut,targetStats); updateTargetTable(nTargetsByAvgCvgBySample,targetStats.second); } @@ -354,10 +375,10 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera for ( Pair targetStats : statsByTarget ) { String gene = getGeneName(targetStats.first,refseqIterator); if ( geneNamesToStats.keySet().contains(gene) ) { - geneNamesToStats.get(gene).merge(targetStats.second.getCoverageBySample()); + geneNamesToStats.get(gene).merge(targetStats.second.getCoverageByAggregationType(CoverageAggregator.AggregationType.SAMPLE)); } else { - geneNamesToStats.put(gene,targetStats.second.getCoverageBySample()); - statsByGene.add(new Pair(gene,targetStats.second.getCoverageBySample())); + geneNamesToStats.put(gene,targetStats.second.getCoverageByAggregationType(CoverageAggregator.AggregationType.SAMPLE)); + statsByGene.add(new Pair(gene,new DepthOfCoverageStats(targetStats.second.getCoverageByAggregationType(CoverageAggregator.AggregationType.SAMPLE)))); } } @@ -498,37 +519,44 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera if ( ! omitSampleSummary ) { logger.info("Printing summary info"); - if ( ! useReadGroup || useBoth ) { - File summaryStatisticsFile = deriveFromStream("sample_summary_statistics"); - File perSampleStatisticsFile = deriveFromStream("sample_statistics"); - printSummary(out,summaryStatisticsFile,coverageProfiles.getCoverageBySample()); - printPerSample(out,perSampleStatisticsFile,coverageProfiles.getCoverageBySample()); - } - - if ( useReadGroup || useBoth ) { - File rgStatsFile = deriveFromStream("read_group_summary"); - File rgSumFile = deriveFromStream("read_group_statistics"); - printSummary(out,rgStatsFile,coverageProfiles.getCoverageByReadGroup()); - printPerSample(out,rgSumFile,coverageProfiles.getCoverageByReadGroup()); + for (CoverageAggregator.AggregationType type : aggregationTypes ) { + outputSummaryFiles(coverageProfiles,type); } } if ( ! omitLocusTable ) { logger.info("Printing locus summary"); - if ( ! useReadGroup || useBoth ) { - File perLocusStatisticsFile = deriveFromStream("sample_locus_statistics"); - File perLocusCoverageFile = deriveFromStream("sample_coverage_statistics"); - printPerLocus(perLocusStatisticsFile,perLocusCoverageFile,coverageProfiles.getCoverageBySample(),false); - } - - if ( useReadGroup || useBoth ) { - File perLocusRGStats = deriveFromStream("read_group_locus_statistics"); - File perLocusRGCoverage = deriveFromStream("read_group_locus_coverage"); - printPerLocus(perLocusRGStats,perLocusRGCoverage,coverageProfiles.getCoverageByReadGroup(),true); + for (CoverageAggregator.AggregationType type : aggregationTypes ) { + outputLocusFiles(coverageProfiles,type); } } } + private String agTypeToString(CoverageAggregator.AggregationType type) { + if ( type == CoverageAggregator.AggregationType.SAMPLE ) { + return "sample"; + } else if ( type == CoverageAggregator.AggregationType.READGROUP ) { + return "read_group"; + } else if ( type == CoverageAggregator.AggregationType.LIBRARY ) { + return "library"; + } else { + throw new StingException("Invalid aggregation type "+type+" sent to agTypeToString. This is a BUG!"); + } + } + + private void outputLocusFiles(CoverageAggregator coverageProfiles, CoverageAggregator.AggregationType type ) { + File locusStats = deriveFromStream(agTypeToString(type)+"_cumulative_coverage_counts"); + File coverageStats = deriveFromStream(agTypeToString(type)+"_cumulative_coverage_proportions"); + printPerLocus(locusStats,coverageStats,coverageProfiles.getCoverageByAggregationType(type),agTypeToString(type)); + } + + private void outputSummaryFiles(CoverageAggregator coverageProfiles, CoverageAggregator.AggregationType type ) { + File summaryFile = deriveFromStream(agTypeToString(type)+"_summary"); + File statsFile = deriveFromStream(agTypeToString(type)+"_statistics"); + printPerSample(out,statsFile,coverageProfiles.getCoverageByAggregationType(type)); + printSummary(out,summaryFile,coverageProfiles.getCoverageByAggregationType(type)); + } + public File deriveFromStream(String append) { String name = getToolkit().getArguments().outFileName; if ( name == null || name.contains("stdout") || name.contains("Stdout") || name.contains("STDOUT")) { @@ -568,7 +596,7 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera } } - private void printPerLocus(File locusFile, File coverageFile, DepthOfCoverageStats stats, boolean isRG) { + private void printPerLocus(File locusFile, File coverageFile, DepthOfCoverageStats stats, String partitionType) { PrintStream output = getCorrectStream(out,locusFile); PrintStream coverageOut = getCorrectStream(out,coverageFile); if ( output == null ) { @@ -587,11 +615,7 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera StringBuilder header = new StringBuilder(); if ( printSampleColumnHeader ) { - if ( isRG ) { - header.append("ReadGroup"); - } else { - header.append("Sample"); - } + header.append(partitionType); } header.append(String.format("%sgte_0",separator)); for ( int d : endpoints ) { @@ -689,24 +713,34 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera return bin; } - private void printDepths(PrintStream stream, Map countsBySample, Set allSamples) { + private void printDepths(PrintStream stream, Map> countsBySampleByType, Map> identifiersByType) { // get the depths per sample and build up the output string while tabulating total and average coverage - // todo -- update me to deal with base counts/indels StringBuilder perSampleOutput = new StringBuilder(); int tDepth = 0; - for ( String s : allSamples ) { - perSampleOutput.append(separator); - long dp = countsBySample.keySet().contains(s) ? sumArray(countsBySample.get(s)) : 0; - perSampleOutput.append(dp); - if ( printBaseCounts ) { + boolean depthCounted = false; + for (CoverageAggregator.AggregationType type : aggregationTypes ) { + Map countsByID = countsBySampleByType.get(type); + for ( String s : identifiersByType.get(type) ) { perSampleOutput.append(separator); - perSampleOutput.append(baseCounts(countsBySample.get(s))); + long dp = countsByID.keySet().contains(s) ? sumArray(countsByID.get(s)) : 0 ; + perSampleOutput.append(dp); + if ( printBaseCounts ) { + perSampleOutput.append(separator); + perSampleOutput.append(baseCounts(countsByID.get(s))); + } + if ( ! depthCounted ) { + tDepth += dp; + } } - tDepth += dp; + depthCounted = true; // only sum the total depth once } + // remember -- genome locus was printed in map() - stream.printf("%s%d%s%.2f%s%n",separator,tDepth,separator,( (double) tDepth/ (double) allSamples.size()), - perSampleOutput); + stream.printf("%s%d",separator,tDepth); + for (CoverageAggregator.AggregationType type : aggregationTypes ) { + stream.printf("%s%.2f",separator, ( (double) tDepth / identifiersByType.get(type).size() ) ); + } + stream.printf("%s%n",perSampleOutput); } private long sumArray(int[] array) { @@ -737,115 +771,99 @@ public class DepthOfCoverageWalker extends LocusWalker, Covera return s.toString(); } + + private void checkOrder(CoverageAggregator ag) { + for (CoverageAggregator.AggregationType t : aggregationTypes ) { + List order = orderCheck.get(t); + List namesInAg = ag.getIdentifiersByType().get(t); + Set namesInDOCS = ag.getCoverageByAggregationType(t).getAllSamples(); + int index = 0; + for ( String s : namesInAg ) { + if ( ! s.equalsIgnoreCase(order.get(index)) ) { + throw new StingException("IDs are out of order for type "+t+"! Aggregator has different ordering"); + } + index++; + } + } + } } class CoverageAggregator { - private DepthOfCoverageStats coverageByRead; - private DepthOfCoverageStats coverageBySample; - enum AggregationType { READ, SAMPLE, BOTH } + enum AggregationType { READGROUP, SAMPLE, LIBRARY } - private AggregationType agType; - private Set sampleNames; - private Set allSamples; - - public CoverageAggregator(AggregationType type, int start, int stop, int nBins) { - if ( type == AggregationType.READ || type == AggregationType.BOTH) { - coverageByRead = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins)); + private List types; + private Map coverageProfiles; + private Map> identifiersByType; + private Set allIdentifiers; + public CoverageAggregator(List typesToUse, int start, int stop, int nBins) { + coverageProfiles = new HashMap(); + identifiersByType = new HashMap>(); + types = typesToUse; + for ( AggregationType type : types ) { + coverageProfiles.put(type,new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins))); + identifiersByType.put(type,new ArrayList()); } - - if ( type == AggregationType.SAMPLE || type == AggregationType.BOTH) { - coverageBySample = new DepthOfCoverageStats(DepthOfCoverageStats.calculateBinEndpoints(start,stop,nBins)); - } - - agType = type; - allSamples = new HashSet(); + allIdentifiers = new HashSet(); } public void merge(CoverageAggregator otherAggregator) { - if ( agType == AggregationType.SAMPLE || agType == AggregationType.BOTH ) { - this.coverageBySample.merge(otherAggregator.coverageBySample); - } - - if ( agType == AggregationType.READ || agType == AggregationType.BOTH) { - this.coverageByRead.merge(otherAggregator.coverageByRead); + for ( AggregationType type : types ) { + this.coverageProfiles.get(type).merge(otherAggregator.coverageProfiles.get(type)); } } - public DepthOfCoverageStats getCoverageByReadGroup() { - return coverageByRead; + public DepthOfCoverageStats getCoverageByAggregationType(AggregationType t) { + return coverageProfiles.get(t); } - public DepthOfCoverageStats getCoverageBySample() { - return coverageBySample; - } - - public void addSamples(Set samples) { - for ( String s : samples ) { - coverageBySample.addSample(s); - allSamples.add(s); - } - - if ( agType == AggregationType.BOTH ) { - sampleNames = samples; + public void addIdentifiers(AggregationType t, Set ids) { + for ( String s : ids ) { + coverageProfiles.get(t).addSample(s); + identifiersByType.get(t).add(s); + allIdentifiers.add(s); } + Collections.sort(identifiersByType.get(t)); } - public void addReadGroups(Set readGroupNames) { - for ( String s : readGroupNames ) { - coverageByRead.addSample(s); - allSamples.add(s); - } - } - - public void initialize(boolean useDels, boolean omitLocusTable) { - if ( agType == AggregationType.SAMPLE || agType == AggregationType.BOTH ) { + for ( AggregationType t : types ) { if ( useDels ) { - coverageBySample.initializeDeletions(); + coverageProfiles.get(t).initializeDeletions(); } - if ( ! omitLocusTable ) { - coverageBySample.initializeLocusCounts(); - } - } - - if ( agType == AggregationType.READ || agType == AggregationType.BOTH) { - if ( useDels ) { - coverageByRead.initializeDeletions(); - } - - if ( ! omitLocusTable ) { - coverageByRead.initializeLocusCounts(); + coverageProfiles.get(t).initializeLocusCounts(); } } } - public void update(Map countsByIdentifier) { - if ( agType != AggregationType.BOTH ) { - if ( agType == AggregationType.SAMPLE ) { - coverageBySample.update(countsByIdentifier); - } else { - coverageByRead.update(countsByIdentifier); - } - } else { // have to separate samples and read groups - HashMap countsBySample = new HashMap(sampleNames.size()); - HashMap countsByRG = new HashMap(allSamples.size()-sampleNames.size()); - for ( String s : countsByIdentifier.keySet() ) { - if ( sampleNames.contains(s) ) { - countsBySample.put(s,countsByIdentifier.get(s)); - } else { // cannot use .remove() to save time due to concurrency issues - countsByRG.put(s,countsByIdentifier.get(s)); - } - } - - coverageBySample.update(countsBySample); - coverageByRead.update(countsByRG); + public void update(Map> countsByIdentifierByType) { + for ( AggregationType t : types ) { + coverageProfiles.get(t).update(countsByIdentifierByType.get(t)); } } - public Set getAllSamples() { - return allSamples; + public Set getAllIdentifiers() { + return allIdentifiers; } + public Map> getIdentifiersByType() { + return identifiersByType; + } + + public static AggregationType typeStringToAggregationType(String type) { + if ( type.equals("sample") ) { + return AggregationType.SAMPLE; + } + + if ( type.equals("library") ) { + return AggregationType.LIBRARY; + } + + if ( type.equals("readgroup") ) { + return AggregationType.READGROUP; + } + + throw new StingException("Valid partition type string "+type+" is not associated with an aggregation type. This is a BUG!"); + } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java index 9427daab1..d6c43e5a8 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageIntegrationTest.java @@ -53,21 +53,29 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { String[] intervals = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/fhs_jhs_30_targts.interval_list"}; String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/FHS_indexed_subset.bam"}; - String cmd = buildRootCmd(hg18,new ArrayList(Arrays.asList(bams)),new ArrayList(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -both --outputFormat csv"; + String cmd = buildRootCmd(hg18,new ArrayList(Arrays.asList(bams)),new ArrayList(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -pt readgroup -pt sample -pt library --outputFormat csv"; WalkerTestSpec spec = new WalkerTestSpec(cmd,0, new ArrayList()); // now add the expected files that get generated - spec.addAuxFile("f53ddf25c2b71e46381f9c434402d88d", baseOutputFile); - spec.addAuxFile("925cc5b49286e0222bce6251d1baafc7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("d9e812398d778f28ed12d7f3d18628e2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("80577bf378de570f84d91b0ef6004102", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_locus_statistics")); - spec.addAuxFile("3a059ad82d945643dc4e03239c4041f5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary_statistics")); - spec.addAuxFile("f3315551081331bc322c53b11412d707", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("fc742e346be2344557cf8c039f467508", baseOutputFile); + spec.addAuxFile("e58b701b01ec0dbe75c146295434ba3b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); + spec.addAuxFile("b9a7748e5aec4dc06daed893c901c00d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); + spec.addAuxFile("848e556ec7e03e9b0398d189d7cbb4ad", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); + spec.addAuxFile("e6fc8f7a5fcc440e21de5891f3403d5d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("cac8e7a688d9bbe781232c61091d3237", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("e452dfb5581a7aafaf2122c5ae497f1b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("38fb89e1bb52d0342f97f72e86956b79", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); + spec.addAuxFile("f9f2941ee39577ac2f80668e7f6b3d4b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); spec.addAuxFile("fd29fe0c14351e934a6fef9df1f38f7b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); - spec.addAuxFile("111261f0e8ccf8c456d0b2a9482bc32c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_locus_statistics")); spec.addAuxFile("cc7ee5075a932dba486e78824ca34202", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); spec.addAuxFile("e1653480daa2d96f7c584ed4cd20c147", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); + spec.addAuxFile("529353375d23c529228b38119c51e269", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); + spec.addAuxFile("650ee3714da7fbad7832c9d4ad49eb51", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("925cc5b49286e0222bce6251d1baafc7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); + spec.addAuxFile("d9e812398d778f28ed12d7f3d18628e2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("f3315551081331bc322c53b11412d707", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("3a059ad82d945643dc4e03239c4041f5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testBaseOutputNoFiltering",spec); }