From 88a06ad81fd50cb7abb514d86a321dae4982d204 Mon Sep 17 00:00:00 2001 From: chartl Date: Wed, 26 May 2010 03:39:22 +0000 Subject: [PATCH] Changes to Depth of Coverage: - For speedup in large number of samples, base counts are done on a per read group level, then merged into counts on larger partitions (samples, libraries, etc) + passed all integration tests before next item - Added additional summary item, a coverage threshold. Set by (possibly multiple) -ct flags, the summary outputs will have columns for "%_bases_covered_to_X"; both per sample, and per sample per interval summary files are effected (thus md5s changed for these) NOTE: This is the last revision that will include the per-gene summary files. Once DesignFileGenerator is sufficiently general, and has integration tests, it will be moved to core and the per-gene summary from Depth of Coverage will be retired. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3437 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/coverage/CoverageUtils.java | 57 ++++++++++++++----- .../coverage/DepthOfCoverageStats.java | 14 +++++ .../coverage/DepthOfCoverageWalker.java | 49 ++++++++++++++-- .../walkers/DesignFileGeneratorWalker.java | 44 +++++++------- .../DepthOfCoverageB36IntegrationTest.java | 4 +- .../DepthOfCoverageIntegrationTest.java | 16 +++--- 6 files changed, 135 insertions(+), 49 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index 1c8958777..7a2251078 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.coverage; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.BaseUtils; @@ -40,39 +41,67 @@ public class CoverageUtils { return counts; } - public static String getTypeID( SAMRecord r, CoverageAggregator.AggregationType type ) { + public static String getTypeID( SAMReadGroupRecord r, CoverageAggregator.AggregationType type ) { if ( type == CoverageAggregator.AggregationType.SAMPLE ) { - return r.getReadGroup().getSample(); + return r.getSample(); } else if ( type == CoverageAggregator.AggregationType.READGROUP ) { - return String.format("%s_rg_%s",r.getReadGroup().getSample(),r.getReadGroup().getReadGroupId()); + return String.format("%s_rg_%s",r.getSample(),r.getReadGroupId()); } else if ( type == CoverageAggregator.AggregationType.LIBRARY ) { - return r.getReadGroup().getLibrary(); + return r.getLibrary(); } else { throw new StingException("Invalid type ID sent to getTypeID. This is a BUG!"); } } + public static Map> getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, List types) { Map> countsByIDByType = new HashMap>(); + Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ); for (CoverageAggregator.AggregationType t : types ) { - countsByIDByType.put(t,new HashMap()); - } - for (PileupElement e : context.getBasePileup()) { - if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ ) || e.isDeletion() ) ) { - for (CoverageAggregator.AggregationType t : types ) { - String id = getTypeID(e.getRead(),t); - if ( ! countsByIDByType.get(t).keySet().contains(id) ) { - countsByIDByType.get(t).put(id,new int[6]); - } - updateCounts(countsByIDByType.get(t).get(id),e); + // iterate through the read group counts and build the type associations + for ( Map.Entry readGroupCountEntry : countsByRG.entrySet() ) { + String typeID = getTypeID(readGroupCountEntry.getKey(),t); + + if ( ! countsByIDByType.keySet().contains(t) ) { + countsByIDByType.put(t,new HashMap()); + } + + if ( ! countsByIDByType.get(t).keySet().contains(typeID) ) { + countsByIDByType.get(t).put(typeID,readGroupCountEntry.getValue().clone()); + } else { + addCounts(countsByIDByType.get(t).get(typeID),readGroupCountEntry.getValue()); } } } + return countsByIDByType; } + public static void addCounts(int[] updateMe, int[] leaveMeAlone ) { + for ( int index = 0; index < leaveMeAlone.length; index++ ) { + updateMe[index] += leaveMeAlone[index]; + } + } + + public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + Map countsByRG = new HashMap(); + for ( PileupElement e : context.getBasePileup() ) { + if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) { + SAMReadGroupRecord readGroup = e.getRead().getReadGroup(); + if ( ! countsByRG.keySet().contains(readGroup) ) { + countsByRG.put(readGroup,new int[6]); + updateCounts(countsByRG.get(readGroup),e); + } else { + updateCounts(countsByRG.get(readGroup),e); + } + } + } + + return countsByRG; + } + private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { counts[BaseUtils.DELETION_INDEX]++; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java index 0b1b2c9fe..718c3d3d1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java @@ -166,6 +166,10 @@ public class DepthOfCoverageStats { } public void update(Map countsBySample) { + if ( countsBySample == null ) { + this.updateDepths(new HashMap(1)); + return; + } // todo -- do we want to do anything special regarding base count or deletion statistics? HashMap depthBySample = new HashMap(); // todo -- needs fixing with advent of new baseutils functionality using ENUMS and handling N,D @@ -309,4 +313,14 @@ public class DepthOfCoverageStats { return distribution; } + public int value2bin(int value) { + for ( int index = 0; index < binLeftEndpoints.length; index++ ) { + if ( binLeftEndpoints[index] >= value ) { + return index; + } + } + + return binLeftEndpoints.length-1; + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index a59b0cfe8..2276b1a21 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -104,6 +104,8 @@ public class DepthOfCoverageWalker extends LocusWalker histograms = stats.getHistograms(); Map means = stats.getMeans(); Map totals = stats.getTotals(); @@ -687,10 +704,16 @@ public class DepthOfCoverageWalker extends LocusWalker> countsBySampleByType, Map> identifiersByType) { // get the depths per sample and build up the output string while tabulating total and average coverage StringBuilder perSampleOutput = new StringBuilder(); @@ -726,11 +763,11 @@ public class DepthOfCoverageWalker extends LocusWalker countsByID = countsBySampleByType.get(type); for ( String s : identifiersByType.get(type) ) { perSampleOutput.append(separator); - long dp = countsByID.keySet().contains(s) ? sumArray(countsByID.get(s)) : 0 ; + long dp = (countsByID != null && countsByID.keySet().contains(s)) ? sumArray(countsByID.get(s)) : 0 ; perSampleOutput.append(dp); if ( printBaseCounts ) { perSampleOutput.append(separator); - perSampleOutput.append(baseCounts(countsByID.get(s))); + perSampleOutput.append(baseCounts(countsByID != null ? countsByID.get(s) : null )); } if ( ! depthCounted ) { tDepth += dp; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java index 43ee6d215..5b6e1ad93 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.RefWalker; @@ -23,8 +24,7 @@ import java.util.*; * Was written in order to annotate the Whole Exome Agilent designs at the Broad institute * Bind the refseq rod as -B refseq,refseq,/path/to/refGene.txt * Bind the interval list as -B interval_list,intervals,/path/to/intervals.interval_list - * Bind the tcga (or other .bed) file as -B tcga,bed,/path/to/tcga.bed - * [TCGA used as the agilent designs used both refseq and TCGA6k definitions] + * Bind the additional files file as -B gene*,bed,/path/to/other/file.bed * @Author chartl * @Date Apr 26, 2010 */ @@ -32,17 +32,17 @@ public class DesignFileGeneratorWalker extends RodWalker { private HashMap intervalBuffer = new HashMap(); private HashSet refseqBuffer = new HashSet(); - private BEDFeature currentTCGA = null; + private HashMap currentBedFeatures; public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - // three items to look up: interval_list, tcga, and refseq + // three items to look up: interval_list, refseq, gene* if ( tracker == null ) { return null; } List intervalsList= tracker.getReferenceMetaData("interval_list"); List refseqList = tracker.getReferenceMetaData("refseq"); - List tcgaList = tracker.getReferenceMetaData("tcga"); + List bedList = tracker.getGATKFeatureMetaData("gene",false); // put any unprocessed intervals into the interval buffer @@ -65,10 +65,9 @@ public class DesignFileGeneratorWalker extends RodWalker { } } - // update the current tcga target - - if ( tcgaList != null && tcgaList.size() > 0 ) { - currentTCGA = (BEDFeature) tcgaList.get(0); + // update the bed features + for ( GATKFeature additionalGene : bedList ) { + currentBedFeatures.put(additionalGene.getName(),(BEDFeature) additionalGene.getUnderlyingObject()); } cleanup(ref); @@ -96,14 +95,18 @@ public class DesignFileGeneratorWalker extends RodWalker { } } - if ( currentTCGA != null && - interval.overlapsP(GenomeLocParser.createGenomeLoc(currentTCGA.getChr(),currentTCGA.getStart(),currentTCGA.getEnd())) && - !currentTCGA.getName().equals("") && - ! intervalBuffer.get(interval).geneNames.contains("TCGA_"+currentTCGA.getName()) ) { + for ( Map.Entry additionalGenes : currentBedFeatures.entrySet() ) { + GenomeLoc entryLoc = GenomeLocParser.createGenomeLoc(additionalGenes.getValue().getChr(),additionalGenes.getValue().getStart(),additionalGenes.getValue().getEnd()); + if ( interval.overlapsP(entryLoc) && + ! additionalGenes.getValue().getName().equals("") && + ! intervalBuffer.get(interval).geneNames.contains(additionalGenes.getKey()+"_"+additionalGenes.getValue().getName())) { + + intervalBuffer.get(interval).update(additionalGenes.getKey()+"_"+additionalGenes.getValue().getName(), + new ArrayList(Arrays.asList(entryLoc)), + null); + nUpdate ++; + } - intervalBuffer.get(interval).update("TCGA_"+currentTCGA.getName().split("_f|_r")[0], - new ArrayList(Arrays.asList(GenomeLocParser.createGenomeLoc(currentTCGA.getChr(),currentTCGA.getStart(),currentTCGA.getEnd()))), - new ArrayList(Arrays.asList(Integer.parseInt(currentTCGA.getName().split("_f|_r")[1])-1))); } } @@ -139,8 +142,11 @@ public class DesignFileGeneratorWalker extends RodWalker { intervalBuffer.remove(interval); } - if ( currentTCGA != null && GenomeLocParser.createGenomeLoc(currentTCGA.getChr(),currentTCGA.getStart(),currentTCGA.getEnd()).isBefore(ref.getLocus()) ) { - currentTCGA = null; + for ( Map.Entry entry : currentBedFeatures.entrySet() ) { + GenomeLoc entryLoc = GenomeLocParser.createGenomeLoc(entry.getValue().getChr(),entry.getValue().getStart(),entry.getValue().getEnd()); + if ( entryLoc.isBefore(ref.getLocus()) ) { + currentBedFeatures.remove(entry.getKey()); + } } } @@ -182,7 +188,7 @@ class IntervalInfoBuilder { public void update(String gene, List exons, List exonNumbers) { if ( geneNames.contains(gene) ) { - if ( gene.startsWith("TCGA") ) { + if ( gene.startsWith("gene") ) { // exons are split up one per bed, so update the exon list for this gene for ( int eOff = 0; eOff < exons.size(); eOff++) { if ( ! exonNumbersByGene.get(gene).contains( exonNumbers.get(eOff) ) ) { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java index ef812de05..f28bf2e94 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java @@ -67,9 +67,9 @@ public class DepthOfCoverageB36IntegrationTest extends WalkerTest { spec.addAuxFile("6b15f5330414b6d4e2f6caea42139fa1", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); spec.addAuxFile("cc6640d82077991dde8a2b523935cdff", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("0fb627234599c258a3fee1b2703e164a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("6f6085293e8cca402ef4fe648cf06ea8", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("3ad34680c216279c52c7fa0c9e078249", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); spec.addAuxFile("347b47ef73fbd4e277704ddbd7834f69", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("ff0c79d161259402d54d572151582697", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("65bc90b4839cb9f20e60aba4956f06a9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testMapQ0Only",spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index b6f4d13d0..ce26569af 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -53,7 +53,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { String[] intervals = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/fhs_jhs_30_targts.interval_list"}; String[] bams = {"/humgen/gsa-hpprojects/GATK/data/Validation_Data/FHS_indexed_subset.bam"}; - String cmd = buildRootCmd(hg18,new ArrayList(Arrays.asList(bams)),new ArrayList(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -pt readgroup -pt sample -pt library --outputFormat csv"; + String cmd = buildRootCmd(hg18,new ArrayList(Arrays.asList(bams)),new ArrayList(Arrays.asList(intervals))) + " -mmq 0 -mbq 0 -dels -baseCounts -pt readgroup -pt sample -pt library --outputFormat csv -ct 10 -ct 15 -ct 20 -ct 25"; WalkerTestSpec spec = new WalkerTestSpec(cmd,0, new ArrayList()); // now add the expected files that get generated @@ -61,21 +61,21 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.addAuxFile("e58b701b01ec0dbe75c146295434ba3b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); spec.addAuxFile("b9a7748e5aec4dc06daed893c901c00d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); spec.addAuxFile("848e556ec7e03e9b0398d189d7cbb4ad", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("e6fc8f7a5fcc440e21de5891f3403d5d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("acd3dfb97ef64ea6547c001640acd194", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); spec.addAuxFile("cac8e7a688d9bbe781232c61091d3237", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("e452dfb5581a7aafaf2122c5ae497f1b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("73c31412b75dc7014549096de7c0c609", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); spec.addAuxFile("38fb89e1bb52d0342f97f72e86956b79", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); spec.addAuxFile("f9f2941ee39577ac2f80668e7f6b3d4b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); - spec.addAuxFile("fd29fe0c14351e934a6fef9df1f38f7b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); + spec.addAuxFile("8c5266ef3c0031d3b0311839b7f59245", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); spec.addAuxFile("cc7ee5075a932dba486e78824ca34202", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); - spec.addAuxFile("e1653480daa2d96f7c584ed4cd20c147", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); + spec.addAuxFile("39afa53bae210f4684951b908aa36c7d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); spec.addAuxFile("529353375d23c529228b38119c51e269", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); spec.addAuxFile("650ee3714da7fbad7832c9d4ad49eb51", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("925cc5b49286e0222bce6251d1baafc7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("d9e812398d778f28ed12d7f3d18628e2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("7a2fc8a2fba91adfbdc90dcc0e1ef79b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); spec.addAuxFile("f3315551081331bc322c53b11412d707", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("3a059ad82d945643dc4e03239c4041f5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("455f9f0c4461bcf0b231fef704266881", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testBaseOutputNoFiltering",spec); } @@ -92,7 +92,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec(cmd,0, new ArrayList()); spec.addAuxFile("d570c27d82a80ebd2852e9d34aff4e87",baseOutputFile); - spec.addAuxFile("00107eea991f7379771b29dea0c859cb",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("0ee40f3e5091536c14e077b77557083a",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); execute("testNoCoverageDueToFiltering",spec); }