create 2 new tools:

- ASEReadCounter (public tool) replce Tuuli's script to produce the input to Manny's tool.
   It count the number of reads that support the ref allele and the alt allele, filtereing low qual reads and bases and keep only properPaired reads
- ASECaller (private tool) take both RNA and DNA, and produce ontingencyTables ** still under development **

minor changes in other tools:
- update RNA HC variant calling scala script
- expose FS method pValueForContingencyTable to be able to call it from ASEcaller

In ASEReadCounter:
- allow different option to deal with overlaping read from the same fragment
- add option to ignore or include indels in the pileups
- add option to disabled DuplicateRead

add ASEReadCounterIntegrationTest.java and files for the test
This commit is contained in:
Ami Levy-Moonshine 2014-08-12 15:21:58 -04:00
parent bf0c72aab0
commit c5fc5c4f8c
3 changed files with 399 additions and 1 deletions

View File

@ -183,7 +183,7 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
return list;
}
private Double pValueForContingencyTable(int[][] originalTable) {
public static Double pValueForContingencyTable(int[][] originalTable) {
final int[][] normalizedTable = normalizeContingencyTable(originalTable);
int[][] table = copyContingencyTable(normalizedTable);

View File

@ -0,0 +1,286 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.rnaseq;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter;
import org.broadinstitute.gatk.engine.walkers.DisabledReadFilters;
import org.broadinstitute.gatk.engine.walkers.Downsample;
import org.broadinstitute.gatk.engine.walkers.LocusWalker;
import org.broadinstitute.gatk.tools.walkers.coverage.CoverageUtils;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.downsampling.DownsampleType;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import java.io.PrintStream;
import java.util.List;
/**
* Calculated allele counts in specific loci, to allow allele specific expression analysis.
*
* <p>
* Calculated allele counts in specific loci after filtering based on mapping quality, base quality, depth of coverage, overlapping paired reads and deletion on the position.
* All thresholds and option can be set as command line arguments.
* The output of this tool is a tab delimited txt file, compatible with Mamba, a downstream tool (http://www.well.ox.ac.uk/~rivas/mamba/).
* A user can use -drf DuplicateRead to allow the tool to count also duplicated reads (a useful option for RNAseq data tools)
* </p>
*
* <h3>Input</h3>
* <p>
* BAM files (with proper headers) to be analyzed for ASE * </p>
* <p>
* a VCF file with specific sites to process
* </p>
* </p></p>
* <h3>Output</h3>
* <p>
* Table (tab delimited file) with the specific allele counts
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ASEReadCounter \
* -o file_name \
* -I input.bam \
* -sites sites.vcf
* -U ALLOW_N_CIGAR_READS
* [-minDepth 10] \
* [--minMappingQuality 10] \
* [--minBaseQuality 2]
* [-drf DuplicateRead]
* </pre>
*/
@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 10000)
//@DisabledReadFilters({DuplicateReadFilter.class}) //currently can be disabled using the command line argument -drf DuplicateRead
public class ASEReadCounter extends LocusWalker<String, Integer> {
@Output
public PrintStream out;
@Input (fullName = "sitesVCFFile",shortName = "sites")
public RodBinding<VariantContext> sites;
/**
* Loci with total depth lower then this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold.
*/
@Argument(fullName = "minDepthOfNonFilteredBase", shortName = "minDepth", doc = "Minimum number of filtered-pass bases that we need to see for reporting this site", required = false, minValue = 0, maxValue = Integer.MAX_VALUE)
public int minDepthOfNonFilteredBases = -1;
/**
* Reads with mapping quality values lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold.
*/
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth", required = false, minValue = 0, maxValue = Integer.MAX_VALUE)
public int minMappingQuality = 0;
/**
* Bases with quality scores lower than this threshold will be skipped. This is set to -1 by default to disable the evaluation and ignore this threshold.
*/
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth", required = false, minValue = 0, maxValue = Byte.MAX_VALUE)
public byte minBaseQuality = 0;
/**
* There are few option to deal with an overlapping read pair.
* COUNT_READS - Count all reads independently (even if from the same fragment).
* COUNT_FRAGMENTS - Count all fragments (even if the reads that compose the fragment are not consistent at that base).
* COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE - Count all fragments (but only if the reads that compose the fragment are consistent at that base).
* defaults to COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE
*/
@Argument(fullName = "countOverlapReadsType", shortName = "overlap", doc = "How should overlapping reads from the same fragment be handled. Current options: COUNT_READS, COUNT_FRAGMENTS and COUNT_FRAGMENTS_REQUIRE_SAME_BASEQUIRE_SAME_BASE (default)", required = false)
public CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS_REQUIRE_SAME_BASE;
/**
* Output file format (e.g. csv, table, rtable); defaults to r-readable table.
*/
@Argument(fullName = "outputFormat", doc = "The format of the output file, can be csv, table, rtable", required = false)
public String outputFormat = "rtable";
/**
* Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
*/
@Advanced
@Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false)
public boolean includeDeletions = false;
@Advanced
@Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false)
public boolean ignoreDeletionSites = false;
final String[] OUTPUT_FORMATS = {"table","rtable","csv"};
public String separator = "\t";
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
public boolean includeReadsWithDeletionAtLoci() { return includeDeletions && ! ignoreDeletionSites; }
public void initialize() {
// Check the output format
boolean goodOutputFormat = false;
for ( final String f : OUTPUT_FORMATS ) {
goodOutputFormat = goodOutputFormat || f.equals(outputFormat);
}
if ( ! goodOutputFormat ) {
throw new IllegalArgumentException("Improper output format. Can be one of table,rtable,csv. Was "+outputFormat);
}
if ( outputFormat.equals("csv") ) {
separator = ",";
}
final String header = "contig"+separator+"position"+separator+"variantID"+separator+"refAllele"+separator+"altAllele"+separator+"refCount"+separator+"altCount"+separator+"totalCount"+separator+"lowMAPQDepth"+separator+"lowBaseQDepth"+separator+"rawDepth"+separator+"otherBases"+separator+"improperPairs";
out.println(header);
}
@Override
public String map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if ( tracker == null )
return null;
final String contig = context.getLocation().getContig();
final long position = context.getPosition();
final char refAllele = (char)ref.getBase();
final List<VariantContext> VCs = tracker.getValues(sites, context.getLocation());
if(VCs != null && VCs.size() > 1)
throw new UserException("more then one variant context in position: "+contig+":"+position);
if(VCs == null || VCs.size() == 0)
return null;
final VariantContext vc = VCs.get(0);
if(!vc.isBiallelic()) {
logger.warn("ignore site: cannot run ASE on non-biallelic sites: " + vc.toString());
return null;
}
final char altAllele = (char)vc.getAlternateAllele(0).getBases()[0];
final String siteID = vc.getID();
final ReadBackedPileup pileup = filterPileup(context.getBasePileup(), countType, includeReadsWithDeletionAtLoci());
// count up the depths of all and QC+ bases
return calculateLineForSite(pileup, contig, position, siteID, refAllele, altAllele);
}
protected ReadBackedPileup filterPileup(final ReadBackedPileup originalPileup, final CoverageUtils.CountPileupType countType, final boolean includeDeletions){
ReadBackedPileup pileupWithDeletions;
if(countType.equals(CoverageUtils.CountPileupType.COUNT_FRAGMENTS_REQUIRE_SAME_BASE))
pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(true,true);
else if(countType.equals(CoverageUtils.CountPileupType.COUNT_READS))
pileupWithDeletions = originalPileup;
else if(countType.equals(CoverageUtils.CountPileupType.COUNT_FRAGMENTS))
pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(false,true);
else
throw new UserException("Must use valid CountPileupType");
return includeDeletions ? pileupWithDeletions: pileupWithDeletions.getPileupWithoutDeletions();
}
protected String calculateLineForSite(final ReadBackedPileup pileup, final String contig, final long position, final String siteID, final char refAllele, final char altAllele){
int rawDepth = 0, lowBaseQDepth = 0, lowMAPQDepth = 0, refCount = 0, altCount = 0, totalNonFilteredCount = 0, otherBasesCount = 0, improperPairsCount = 0 ;
for (final PileupElement base : pileup) {
rawDepth++;
if (!base.getRead().getProperPairFlag()){
improperPairsCount++;
continue;
}
if (base.getMappingQual() < minMappingQuality) {
lowMAPQDepth++;
continue;
}
if (base.getQual() < minBaseQuality) {
lowBaseQDepth++;
continue;
}
if(base.getBase() == refAllele)
refCount++;
else if(base.getBase() == altAllele)
altCount++;
else {
otherBasesCount++;
continue;
}
totalNonFilteredCount++;
}
if(totalNonFilteredCount < minDepthOfNonFilteredBases)
return null;
return contig +separator+
position +separator+
siteID +separator+
refAllele +separator+
altAllele +separator+
refCount +separator+
altCount +separator+
totalNonFilteredCount +separator+
lowMAPQDepth +separator+
lowBaseQDepth +separator+
rawDepth +separator+
otherBasesCount +separator+
improperPairsCount;
}
@Override
public Integer reduceInit() {
return 0;
}
@Override
public Integer reduce(String results, Integer sum) {
if(results!= null)
out.println(results);
return ++sum;
}
@Override
public void onTraversalDone(Integer sum) {
logger.info("Done processing "+sum+" loci");
out.close();
}
}

View File

@ -0,0 +1,112 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.rnaseq;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
* Test all different parameters.
*/
public class ASEReadCounterIntegrationTest extends WalkerTest {
@Test
public void testASEReadCounterWithHighMQ() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("eec421405a4a570751821d158734020e"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithLowMQ() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 1 -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("4f1144be0bb2c4adeae00625afd04cda"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithLowMQNoDedup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 10 -o %s -U ALLOW_N_CIGAR_READS -drf DuplicateRead", 1,
Arrays.asList("226021673310f28d6520d7f3cfe8cb4b"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithHighMQLowBQ() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS", 1,
Arrays.asList("f86bf14ca3a2cc0114d6a11de0cd9448"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithCountOverlaps() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -overlap COUNT_FRAGMENTS", 1,
Arrays.asList("bfaeaaa8c000eca82f703225bf2257a1"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterWithCountReads() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -overlap COUNT_READS", 1,
Arrays.asList("a9b420fd12a9162b31a48842c4081a1f"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterMinDepth70() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS -minDepth 70", 1,
Arrays.asList("79b99bc8a1c1c58ac860f79a11f93086"));
executeTest("test high mq with no read passing", spec);
}
@Test
public void testASEReadCounterFileFormat() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ASEReadCounter -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -sites "+privateTestDir +"NA12878.chr20_2444518_2637800.RNAseq.SYNONYMOUS_CODING.vcf -mmq 60 -mbq 10 -o %s -U ALLOW_N_CIGAR_READS --outputFormat csv", 1,
Arrays.asList("2c7c531018ab353e6874ee2da7980986"));
executeTest("test high mq with no read passing", spec);
}
}