diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java index e52c58187..eb8b17ee4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java @@ -183,7 +183,7 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, return list; } - private Double pValueForContingencyTable(int[][] originalTable) { + public static Double pValueForContingencyTable(int[][] originalTable) { final int[][] normalizedTable = normalizeContingencyTable(originalTable); int[][] table = copyContingencyTable(normalizedTable); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java new file mode 100644 index 000000000..f1f9c55d8 --- /dev/null +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounter.java @@ -0,0 +1,286 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.rnaseq; + +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; +import org.broadinstitute.gatk.engine.walkers.DisabledReadFilters; +import org.broadinstitute.gatk.engine.walkers.Downsample; +import org.broadinstitute.gatk.engine.walkers.LocusWalker; +import org.broadinstitute.gatk.tools.walkers.coverage.CoverageUtils; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; + +import java.io.PrintStream; +import java.util.List; + +/** + * Calculated allele counts in specific loci, to allow allele specific expression analysis. + * + *

+ * Calculated allele counts in specific loci after filtering based on mapping quality, base quality, depth of coverage, overlapping paired reads and deletion on the position. + * All thresholds and option can be set as command line arguments. + * The output of this tool is a tab delimited txt file, compatible with Mamba, a downstream tool (http://www.well.ox.ac.uk/~rivas/mamba/). + * A user can use -drf DuplicateRead to allow the tool to count also duplicated reads (a useful option for RNAseq data tools) + *

+ * + *

Input

+ *

+ * BAM files (with proper headers) to be analyzed for ASE *

+ *

+ * a VCF file with specific sites to process + *

+ *

+ *

Output

+ *

+ * Table (tab delimited file) with the specific allele counts + *

+ * + *

Examples

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ASEReadCounter \
+ *   -o file_name \
+ *   -I input.bam \
+ *   -sites sites.vcf
+ *   -U ALLOW_N_CIGAR_READS
+ *   [-minDepth 10] \
+ *   [--minMappingQuality 10] \
+ *   [--minBaseQuality 2]
+ *   [-drf DuplicateRead]
+ * 
+ */ +@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 10000) +//@DisabledReadFilters({DuplicateReadFilter.class}) //currently can be disabled using the command line argument -drf DuplicateRead +public class ASEReadCounter extends LocusWalker { + + @Output + public PrintStream out; + + @Input (fullName = "sitesVCFFile",shortName = "sites") + public RodBinding sites; + + /** + * Loci with total depth lower then this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold. + */ + @Argument(fullName = "minDepthOfNonFilteredBase", shortName = "minDepth", doc = "Minimum number of filtered-pass bases that we need to see for reporting this site", required = false, minValue = 0, maxValue = Integer.MAX_VALUE) + public int minDepthOfNonFilteredBases = -1; + + /** + * Reads with mapping quality values lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold. + */ + @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth", required = false, minValue = 0, maxValue = Integer.MAX_VALUE) + public int minMappingQuality = 0; + + /** + * Bases with quality scores lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold. + */ + @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth", required = false, minValue = 0, maxValue = Byte.MAX_VALUE) + public byte minBaseQuality = 0; + + /** + * There are few option to deal with an overlapping read pair. + * COUNT_READS - Count all reads independently (even if from the same fragment). + * COUNT_FRAGMENTS - Count all fragments (even if the reads that compose the fragment are not consistent at that base). + * COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE - Count all fragments (but only if the reads that compose the fragment are consistent at that base). + * defaults to COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE + */ + @Argument(fullName = "countOverlapReadsType", shortName = "overlap", doc = "How should overlapping reads from the same fragment be handled. Current options: COUNT_READS, COUNT_FRAGMENTS and COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE (default)", required = false) + public CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS_REQUIRE_SAME_BASE; + + /** + * Output file format (e.g. csv, table, rtable); defaults to r-readable table. + */ + @Argument(fullName = "outputFormat", doc = "The format of the output file, can be csv, table, rtable", required = false) + public String outputFormat = "rtable"; + + /** + * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output. + */ + @Advanced + @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) + public boolean includeDeletions = false; + + @Advanced + @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) + public boolean ignoreDeletionSites = false; + + final String[] OUTPUT_FORMATS = {"table","rtable","csv"}; + public String separator = "\t"; + + //////////////////////////////////////////////////////////////////////////////////// + // STANDARD WALKER METHODS + //////////////////////////////////////////////////////////////////////////////////// + + public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; } + + public void initialize() { + + // Check the output format + boolean goodOutputFormat = false; + for ( final String f : OUTPUT_FORMATS ) { + goodOutputFormat = goodOutputFormat || f.equals(outputFormat); + } + + if ( ! goodOutputFormat ) { + throw new IllegalArgumentException("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat); + } + + if ( outputFormat.equals("csv") ) { + separator = ","; + } + final String header = "contig"+separator+"position"+separator+"variantID"+separator+"refAllele"+separator+"altAllele"+separator+"refCount"+separator+"altCount"+separator+"totalCount"+separator+"lowMAPQDepth"+separator+"lowBaseQDepth"+separator+"rawDepth"+separator+"otherBases"+separator+"improperPairs"; + out.println(header); + + } + + + @Override + public String map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( tracker == null ) + return null; + final String contig = context.getLocation().getContig(); + final long position = context.getPosition(); + + final char refAllele = (char)ref.getBase(); + + final List VCs = tracker.getValues(sites, context.getLocation()); + if(VCs != null && VCs.size() > 1) + throw new UserException("more then one variant context in position: "+contig+":"+position); + if(VCs == null || VCs.size() == 0) + return null; + + final VariantContext vc = VCs.get(0); + if(!vc.isBiallelic()) { + logger.warn("ignore site: cannot run ASE on non-biallelic sites: " + vc.toString()); + return null; + } + + final char altAllele = (char)vc.getAlternateAllele(0).getBases()[0]; + final String siteID = vc.getID(); + final ReadBackedPileup pileup = filterPileup(context.getBasePileup(), countType, includeReadsWithDeletionAtLoci()); + + // count up the depths of all and QC+ bases + return calculateLineForSite(pileup, contig, position, siteID, refAllele, altAllele); + + } + + protected ReadBackedPileup filterPileup(final ReadBackedPileup originalPileup, final CoverageUtils.CountPileupType countType, final boolean includeDeletions){ + + ReadBackedPileup pileupWithDeletions; + if(countType.equals(CoverageUtils.CountPileupType.COUNT_FRAGMENTS_REQUIRE_SAME_BASE)) + pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(true,true); + else if(countType.equals(CoverageUtils.CountPileupType.COUNT_READS)) + pileupWithDeletions = originalPileup; + else if(countType.equals(CoverageUtils.CountPileupType.COUNT_FRAGMENTS)) + pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(false,true); + else + throw new UserException("Must use valid CountPileupType"); + + return includeDeletions ? pileupWithDeletions: pileupWithDeletions.getPileupWithoutDeletions(); + + } + + protected String calculateLineForSite(final ReadBackedPileup pileup, final String contig, final long position, final String siteID, final char refAllele, final char altAllele){ + + int rawDepth = 0, lowBaseQDepth = 0, lowMAPQDepth = 0, refCount = 0, altCount = 0, totalNonFilteredCount = 0, otherBasesCount = 0, improperPairsCount = 0 ; + + for (final PileupElement base : pileup) { + rawDepth++; + + if (!base.getRead().getProperPairFlag()){ + improperPairsCount++; + continue; + } + if (base.getMappingQual() < minMappingQuality) { + lowMAPQDepth++; + continue; + } + + if (base.getQual() < minBaseQuality) { + lowBaseQDepth++; + continue; + } + + if(base.getBase() == refAllele) + refCount++; + else if(base.getBase() == altAllele) + altCount++; + else { + otherBasesCount++; + continue; + } + totalNonFilteredCount++; + } + + if(totalNonFilteredCount < minDepthOfNonFilteredBases) + return null; + + return contig +separator+ + position +separator+ + siteID +separator+ + refAllele +separator+ + altAllele +separator+ + refCount +separator+ + altCount +separator+ + totalNonFilteredCount +separator+ + lowMAPQDepth +separator+ + lowBaseQDepth +separator+ + rawDepth +separator+ + otherBasesCount +separator+ + improperPairsCount; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(String results, Integer sum) { + if(results!= null) + out.println(results); + return ++sum; + } + + @Override + public void onTraversalDone(Integer sum) { + logger.info("Done processing "+sum+" loci"); + out.close(); + } + + + + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounterIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounterIntegrationTest.java new file mode 100644 index 000000000..238ce93c7 --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/ASEReadCounterIntegrationTest.java @@ -0,0 +1,112 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.rnaseq; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Test all different parameters. + */ +public class ASEReadCounterIntegrationTest extends WalkerTest { + + @Test + public void testASEReadCounterWithHighMQ() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -o %s -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("eec421405a4a570751821d158734020e")); + executeTest("test high mq with no read passing", spec); + } + + @Test + public void testASEReadCounterWithLowMQ() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 1 -o %s -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("4f1144be0bb2c4adeae00625afd04cda")); + executeTest("test high mq with no read passing", spec); + } + + @Test + public void testASEReadCounterWithLowMQNoDedup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 10 -o %s -U ALLOW_N_CIGAR_READS -drf DuplicateRead", 1, + Arrays.asList("226021673310f28d6520d7f3cfe8cb4b")); + executeTest("test high mq with no read passing", spec); + } + + @Test + public void testASEReadCounterWithHighMQLowBQ() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("f86bf14ca3a2cc0114d6a11de0cd9448")); + executeTest("test high mq with no read passing", spec); + } + + @Test + public void testASEReadCounterWithCountOverlaps() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -overlap COUNT_FRAGMENTS", 1, + Arrays.asList("bfaeaaa8c000eca82f703225bf2257a1")); + executeTest("test high mq with no read passing", spec); + } + + @Test + public void testASEReadCounterWithCountReads() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -overlap COUNT_READS", 1, + Arrays.asList("a9b420fd12a9162b31a48842c4081a1f")); + executeTest("test high mq with no read passing", spec); + } + + + @Test + public void testASEReadCounterMinDepth70() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -minDepth 70", 1, + Arrays.asList("79b99bc8a1c1c58ac860f79a11f93086")); + executeTest("test high mq with no read passing", spec); + } + + @Test + public void testASEReadCounterFileFormat() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS --outputFormat csv", 1, + Arrays.asList("2c7c531018ab353e6874ee2da7980986")); + executeTest("test high mq with no read passing", spec); + } + + + + + + + + + + +}