Merge pull request #886 from broadinstitute/ami-RNA-ASEtools-march2015

Ami RNA ASE tools
This commit is contained in:
amilev 2015-03-21 18:31:24 -04:00
commit 244dbe8634
3 changed files with 399 additions and 1 deletions

View File

@ -183,7 +183,7 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
return list;
}
private Double pValueForContingencyTable(int[][] originalTable) {
public static Double pValueForContingencyTable(int[][] originalTable) {
final int[][] normalizedTable = normalizeContingencyTable(originalTable);
int[][] table = copyContingencyTable(normalizedTable);

View File

@ -0,0 +1,286 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.rnaseq;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter;
import org.broadinstitute.gatk.engine.walkers.DisabledReadFilters;
import org.broadinstitute.gatk.engine.walkers.Downsample;
import org.broadinstitute.gatk.engine.walkers.LocusWalker;
import org.broadinstitute.gatk.tools.walkers.coverage.CoverageUtils;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.downsampling.DownsampleType;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import java.io.PrintStream;
import java.util.List;
/**
* Calculated allele counts in specific loci, to allow allele specific expression analysis.
*
* <p>
* Calculated allele counts in specific loci after filtering based on mapping quality, base quality, depth of coverage, overlapping paired reads and deletion on the position.
* All thresholds and option can be set as command line arguments.
* The output of this tool is a tab delimited txt file, compatible with Mamba, a downstream tool (http://www.well.ox.ac.uk/~rivas/mamba/).
* A user can use -drf DuplicateRead to allow the tool to count also duplicated reads (a useful option for RNAseq data tools)
* </p>
*
* <h3>Input</h3>
* <p>
* BAM files (with proper headers) to be analyzed for ASE * </p>
* <p>
* a VCF file with specific sites to process
* </p>
* </p></p>
* <h3>Output</h3>
* <p>
* Table (tab delimited file) with the specific allele counts
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ASEReadCounter \
* -o file_name \
* -I input.bam \
* -sites sites.vcf
* -U ALLOW_N_CIGAR_READS
* [-minDepth 10] \
* [--minMappingQuality 10] \
* [--minBaseQuality 2]
* [-drf DuplicateRead]
* </pre>
*/
@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 10000)
//@DisabledReadFilters({DuplicateReadFilter.class}) //currently can be disabled using the command line argument -drf DuplicateRead
public class ASEReadCounter extends LocusWalker<String, Integer> {
@Output
public PrintStream out;
@Input (fullName = "sitesVCFFile",shortName = "sites")
public RodBinding<VariantContext> sites;
/**
* Loci with total depth lower then this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold.
*/
@Argument(fullName = "minDepthOfNonFilteredBase", shortName = "minDepth", doc = "Minimum number of filtered-pass bases that we need to see for reporting this site", required = false, minValue = 0, maxValue = Integer.MAX_VALUE)
public int minDepthOfNonFilteredBases = -1;
/**
* Reads with mapping quality values lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold.
*/
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth", required = false, minValue = 0, maxValue = Integer.MAX_VALUE)
public int minMappingQuality = 0;
/**
* Bases with quality scores lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold.
*/
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth", required = false, minValue = 0, maxValue = Byte.MAX_VALUE)
public byte minBaseQuality = 0;
/**
* There are few option to deal with an overlapping read pair.
* COUNT_READS - Count all reads independently (even if from the same fragment).
* COUNT_FRAGMENTS - Count all fragments (even if the reads that compose the fragment are not consistent at that base).
* COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE - Count all fragments (but only if the reads that compose the fragment are consistent at that base).
* defaults to COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE
*/
@Argument(fullName = "countOverlapReadsType", shortName = "overlap", doc = "How should overlapping reads from the same fragment be handled. Current options: COUNT_READS, COUNT_FRAGMENTS and COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE (default)", required = false)
public CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS_REQUIRE_SAME_BASE;
/**
* Output file format (e.g. csv, table, rtable); defaults to r-readable table.
*/
@Argument(fullName = "outputFormat", doc = "The format of the output file, can be csv, table, rtable", required = false)
public String outputFormat = "rtable";
/**
* Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
*/
@Advanced
@Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
public boolean includeDeletions = false;
@Advanced
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
public boolean ignoreDeletionSites = false;
final String[] OUTPUT_FORMATS = {"table","rtable","csv"};
public String separator = "\t";
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; }
public void initialize() {
// Check the output format
boolean goodOutputFormat = false;
for ( final String f : OUTPUT_FORMATS ) {
goodOutputFormat = goodOutputFormat || f.equals(outputFormat);
}
if ( ! goodOutputFormat ) {
throw new IllegalArgumentException("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat);
}
if ( outputFormat.equals("csv") ) {
separator = ",";
}
final String header = "contig"+separator+"position"+separator+"variantID"+separator+"refAllele"+separator+"altAllele"+separator+"refCount"+separator+"altCount"+separator+"totalCount"+separator+"lowMAPQDepth"+separator+"lowBaseQDepth"+separator+"rawDepth"+separator+"otherBases"+separator+"improperPairs";
out.println(header);
}
@Override
public String map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if ( tracker == null )
return null;
final String contig = context.getLocation().getContig();
final long position = context.getPosition();
final char refAllele = (char)ref.getBase();
final List<VariantContext> VCs = tracker.getValues(sites, context.getLocation());
if(VCs != null && VCs.size() > 1)
throw new UserException("more then one variant context in position: "+contig+":"+position);
if(VCs == null || VCs.size() == 0)
return null;
final VariantContext vc = VCs.get(0);
if(!vc.isBiallelic()) {
logger.warn("ignore site: cannot run ASE on non-biallelic sites: " + vc.toString());
return null;
}
final char altAllele = (char)vc.getAlternateAllele(0).getBases()[0];
final String siteID = vc.getID();
final ReadBackedPileup pileup = filterPileup(context.getBasePileup(), countType, includeReadsWithDeletionAtLoci());
// count up the depths of all and QC+ bases
return calculateLineForSite(pileup, contig, position, siteID, refAllele, altAllele);
}
protected ReadBackedPileup filterPileup(final ReadBackedPileup originalPileup, final CoverageUtils.CountPileupType countType, final boolean includeDeletions){
ReadBackedPileup pileupWithDeletions;
if(countType.equals(CoverageUtils.CountPileupType.COUNT_FRAGMENTS_REQUIRE_SAME_BASE))
pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(true,true);
else if(countType.equals(CoverageUtils.CountPileupType.COUNT_READS))
pileupWithDeletions = originalPileup;
else if(countType.equals(CoverageUtils.CountPileupType.COUNT_FRAGMENTS))
pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(false,true);
else
throw new UserException("Must use valid CountPileupType");
return includeDeletions ? pileupWithDeletions: pileupWithDeletions.getPileupWithoutDeletions();
}
protected String calculateLineForSite(final ReadBackedPileup pileup, final String contig, final long position, final String siteID, final char refAllele, final char altAllele){
int rawDepth = 0, lowBaseQDepth = 0, lowMAPQDepth = 0, refCount = 0, altCount = 0, totalNonFilteredCount = 0, otherBasesCount = 0, improperPairsCount = 0 ;
for (final PileupElement base : pileup) {
rawDepth++;
if (!base.getRead().getProperPairFlag()){
improperPairsCount++;
continue;
}
if (base.getMappingQual() < minMappingQuality) {
lowMAPQDepth++;
continue;
}
if (base.getQual() < minBaseQuality) {
lowBaseQDepth++;
continue;
}
if(base.getBase() == refAllele)
refCount++;
else if(base.getBase() == altAllele)
altCount++;
else {
otherBasesCount++;
continue;
}
totalNonFilteredCount++;
}
if(totalNonFilteredCount < minDepthOfNonFilteredBases)
return null;
return contig +separator+
position +separator+
siteID +separator+
refAllele +separator+
altAllele +separator+
refCount +separator+
altCount +separator+
totalNonFilteredCount +separator+
lowMAPQDepth +separator+
lowBaseQDepth +separator+
rawDepth +separator+
otherBasesCount +separator+
improperPairsCount;
}
@Override
public Integer reduceInit() {
return 0;
}
@Override
public Integer reduce(String results, Integer sum) {
if(results!= null)
out.println(results);
return ++sum;
}
@Override
public void onTraversalDone(Integer sum) {
logger.info("Done processing "+sum+" loci");
out.close();
}
}

View File

@ -0,0 +1,112 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.rnaseq;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
* Test all different parameters.
*/
public class ASEReadCounterIntegrationTest extends WalkerTest {
@Test
public void testASEReadCounterWithHighMQ() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("eec421405a4a570751821d158734020e"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithLowMQ() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 1 -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("4f1144be0bb2c4adeae00625afd04cda"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithLowMQNoDedup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 10 -o %s -U ALLOW_N_CIGAR_READS -drf DuplicateRead", 1,
Arrays.asList("226021673310f28d6520d7f3cfe8cb4b"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithHighMQLowBQ() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("f86bf14ca3a2cc0114d6a11de0cd9448"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithCountOverlaps() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -overlap COUNT_FRAGMENTS", 1,
Arrays.asList("bfaeaaa8c000eca82f703225bf2257a1"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithCountReads() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -overlap COUNT_READS", 1,
Arrays.asList("a9b420fd12a9162b31a48842c4081a1f"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterMinDepth70() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -minDepth 70", 1,
Arrays.asList("79b99bc8a1c1c58ac860f79a11f93086"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterFileFormat() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS --outputFormat csv", 1,
Arrays.asList("2c7c531018ab353e6874ee2da7980986"));
executeTest("test high mq with no read passing", spec);
}
}