Walkers and integration tests that calculate and compare callable bases

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3328 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-05-07 21:33:47 +00:00
parent d070554329
commit 64ccaa4c6a
6 changed files with 443 additions and 11 deletions

View File

@ -0,0 +1,190 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* Þles (the ÓSoftwareÓ), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, sub ject to the following
* conditions:
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED ÓAS ISÓ, WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* * OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.commandline.Argument;
/**
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
*
* @Author depristo
* @Date May 7, 2010
*/
@By(DataSource.REFERENCE)
public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableBaseState, CallableLociWalker.CallableBaseState> {
@Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false)
byte maxLowMAPQ = 1;
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
byte minMappingQuality = 10;
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
byte minBaseQuality = 20;
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
int minDepth = 4;
@Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
int maxDepth = -1;
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
int minDepthLowMAPQ = 10;
@Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
double maxLowMAPQFraction = 0.1;
@Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false)
OutputFormat outputFormat;
public enum OutputFormat { BED, STATE_PER_BASE }
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
public boolean includeReadsWithDeletionAtLoci() { return true; }
public void initialize() {
}
// public boolean isReduceByInterval() {
// return false;
// }
public static class CallableBaseState {
public GenomeLoc loc;
public CalledState state;
public CallableBaseState(GenomeLoc loc, CalledState state) {
this.loc = loc;
this.state = state;
}
public GenomeLoc getLocation() {
return loc;
}
public CalledState getState() {
return state;
}
// update routines
public boolean changingState( CalledState newState ) {
return state != newState;
}
/**
* Updating the location of this CalledBaseState by the new stop location
* @param newStop
*/
public void update(GenomeLoc newStop) {
loc = GenomeLocParser.createGenomeLoc(loc.getContigIndex(), loc.getStart(), newStop.getStop());
}
public String toString() {
return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
//return String.format("%s %d %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), loc.getStop() - loc.getStart() + 1, state);
}
}
public enum CalledState { REF_N, CALLABLE, NO_COVERAGE, LOW_COVERAGE, EXCESSIVE_COVERAGE, POOR_MAPPING_QUALITY }
public CallableBaseState reduceInit() {
return null;
}
public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
CalledState state;
if ( BaseUtils.isNBase(ref.getBase()) ) {
state = CalledState.REF_N;
} else {
// count up the depths of all and QC+ bases
int rawDepth = 0, QCDepth = 0, lowMAPQDepth = 0;
for (PileupElement e : context.getBasePileup()) {
rawDepth++;
if ( e.getMappingQual() <= maxLowMAPQ )
lowMAPQDepth++;
if ( e.getMappingQual() >= minMappingQuality && ( e.getQual() >= minBaseQuality || e.isDeletion() ) ) {
QCDepth++;
}
}
//System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth);
if ( rawDepth == 0 ) {
state = CalledState.NO_COVERAGE;
} else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio( lowMAPQDepth, rawDepth ) >= maxLowMAPQFraction ) {
state = CalledState.POOR_MAPPING_QUALITY;
} else if ( QCDepth < minDepth ) {
state = CalledState.LOW_COVERAGE;
} else if ( rawDepth >= maxDepth && maxDepth != -1 ) {
state = CalledState.EXCESSIVE_COVERAGE;
} else {
state = CalledState.CALLABLE;
}
}
return new CallableBaseState(context.getLocation(), state);
}
public CallableBaseState reduce(CallableBaseState state, CallableBaseState integrator) {
if ( outputFormat == OutputFormat.STATE_PER_BASE ) {
out.println(state.toString());
} else {
// format is integrating
if ( integrator == null ) {
integrator = state;
} else if ( state.getLocation().getStart() != integrator.getLocation().getStop() + 1 ||
integrator.changingState(state.getState()) ) {
out.println(integrator.toString());
integrator = state;
} else {
integrator.update(state.getLocation());
}
}
return integrator;
}
////////////////////////////////////////////////////////////////////////////////////
// INTERVAL ON TRAVERSAL DONE
////////////////////////////////////////////////////////////////////////////////////
public void onTraversalDone(CallableBaseState result) {
// print out the last state
if ( result!= null )
out.println(result.toString());
}
}

View File

@ -0,0 +1,129 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* Þles (the ÓSoftwareÓ), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, sub ject to the following
* conditions:
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED ÓAS ISÓ, WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* * OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.commandline.Argument;
import org.broad.tribble.bed.FullBEDFeature;
import java.util.*;
/**
* Test routine for new VariantContext object
*/
public class CompareCallableLociWalker extends RodWalker<List<CallableLociWalker.CallableBaseState>, long[][]> {
@Argument(shortName="comp1", doc="First comparison track name", required=false)
protected String COMP1 = "comp1";
@Argument(shortName="comp2", doc="First comparison track name", required=false)
protected String COMP2 = "comp2";
@Argument(shortName="printState", doc="If provided, prints sites satisfying this state pair", required=false)
protected String printState = null;
CallableLociWalker.CalledState printState1 = CallableLociWalker.CalledState.REF_N;
CallableLociWalker.CalledState printState2 = CallableLociWalker.CalledState.REF_N;
// --------------------------------------------------------------------------------------------------------------
//
// initialize
//
// --------------------------------------------------------------------------------------------------------------
public void initialize() {
if ( printState != null ) {
String[] states = printState.split(",");
printState1 = CallableLociWalker.CalledState.valueOf(states[0]);
printState2 = CallableLociWalker.CalledState.valueOf(states[1]);
}
}
// --------------------------------------------------------------------------------------------------------------
//
// map
//
// --------------------------------------------------------------------------------------------------------------
public List<CallableLociWalker.CallableBaseState> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker != null ) {
CallableLociWalker.CallableBaseState comp1 = getCallableBaseState(tracker, COMP1);
CallableLociWalker.CallableBaseState comp2 = getCallableBaseState(tracker, COMP2);
if ( printState != null && comp1.getState() == printState1 && comp2.getState() == printState2 ) {
out.printf("%s %s %s %s%n", comp1.getLocation(), comp1.getState(), comp2.getLocation(), comp2.getState());
}
return Arrays.asList(comp1, comp2);
} else {
return null;
}
}
private CallableLociWalker.CallableBaseState getCallableBaseState(RefMetaDataTracker tracker, String track) {
//System.out.printf("tracker %s%n", tracker);
List<Object> bindings = tracker.getReferenceMetaData(track);
if ( bindings.size() != 1 || ! (bindings.get(0) instanceof FullBEDFeature)) {
throw new StingException(String.format("%s track isn't a properly formated CallableBases object!", track));
}
FullBEDFeature bed = (FullBEDFeature)bindings.get(0);
GenomeLoc loc = GenomeLocParser.createGenomeLoc(bed.getChr(), bed.getStart(), bed.getEnd());
CallableLociWalker.CalledState state = CallableLociWalker.CalledState.valueOf(bed.getName());
return new CallableLociWalker.CallableBaseState(loc, state);
}
// --------------------------------------------------------------------------------------------------------------
//
// reduce
//
// --------------------------------------------------------------------------------------------------------------
public long[][] reduceInit() {
int n = CallableLociWalker.CalledState.values().length;
return new long[n][n];
}
public long[][] reduce(List<CallableLociWalker.CallableBaseState> comps, long[][] sum) {
if ( comps != null ) {
CallableLociWalker.CallableBaseState comp1 = comps.get(0);
CallableLociWalker.CallableBaseState comp2 = comps.get(1);
sum[comp1.getState().ordinal()][comp2.getState().ordinal()]++;
}
return sum;
}
public void onTraversalDone(long[][] result) {
for ( CallableLociWalker.CalledState state1 : CallableLociWalker.CalledState.values() ) {
for ( CallableLociWalker.CalledState state2 : CallableLociWalker.CalledState.values() ) {
out.printf("%s %s %s %s %d%n", COMP1, COMP2, state1, state2, result[state1.ordinal()][state2.ordinal()]);
}
}
}
}

View File

@ -19,21 +19,40 @@ public class CoverageUtils {
public enum PartitionType { BY_READ_GROUP, BY_SAMPLE }
/**
* Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context
* as an array of ints, indexed by the index fields of BaseUtils
*
* @param context
* @param minMapQ
* @param minBaseQ
* @return
*/
public static int[] getBaseCounts(AlignmentContext context, int minMapQ, int minBaseQ) {
int[] counts = new int[6];
for (PileupElement e : context.getBasePileup()) {
if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) {
updateCounts(counts,e);
}
}
return counts;
}
public static Map<String,int[]> getBaseCountsBySample(AlignmentContext context, int minMapQ, int minBaseQ, PartitionType type) {
Map<String,int[]> samplesToCounts = new HashMap<String,int[]>();
for (PileupElement e : context.getBasePileup()) {
if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) {
String sample = type == PartitionType.BY_SAMPLE ? e.getRead().getReadGroup().getSample() :
e.getRead().getReadGroup().getSample()+"_rg_"+e.getRead().getReadGroup().getReadGroupId();
if ( samplesToCounts.keySet().contains(sample) ) {
updateCounts(samplesToCounts.get(sample),e);
} else {
samplesToCounts.put(sample,new int[6]);
updateCounts(samplesToCounts.get(sample),e);
}
}
for (PileupElement e : context.getBasePileup()) {
if ( e.getMappingQual() >= minMapQ && ( e.getQual() >= minBaseQ || e.isDeletion() ) ) {
String sample = type == PartitionType.BY_SAMPLE ? e.getRead().getReadGroup().getSample() :
e.getRead().getReadGroup().getSample()+"_rg_"+e.getRead().getReadGroup().getReadGroupId();
if ( ! samplesToCounts.keySet().contains(sample) )
samplesToCounts.put(sample,new int[6]);
updateCounts(samplesToCounts.get(sample),e);
}
}
return samplesToCounts;
}

View File

@ -42,6 +42,8 @@ public abstract class BaseTest {
protected static String seqLocation = "/seq/";
protected static String oneKGLocation = "/broad/1KG/";
protected static String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
protected static String b36KGReference = "/broad/1KG/reference/human_b36_both.fasta";
protected static String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
protected static String validationDataLocation = GATKDataLocation + "Validation_Data/";
protected static String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/";

View File

@ -0,0 +1,53 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* Þles (the ÓSoftwareÓ), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, sub ject to the following
* conditions:
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED ÓAS ISÓ, WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* * OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
public class CallableLociWalkerIntegrationTest extends WalkerTest {
final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s";
@Test
public void testCallableLociWalker1() {
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("b3a273984da6744d6de4ca0ab3eb759b"));
executeTest("formatBed", spec);
}
@Test
public void testCallableLociWalker2() {
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100;1:10,000,110-10,000,120";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("c671f65712d9575b8b3e1f1dbedc146e"));
executeTest("formatBed by interval", spec);
}
@Test
public void testCallableLociWalker3() {
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("3fee7d7d0e305f439db29b4e641d1c20"));
executeTest("formatBed lots of arguments", spec);
}
}

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* Þles (the ÓSoftwareÓ), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, sub ject to the following
* conditions:
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED ÓAS ISÓ, WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* * OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
public class CompareCallableLociWalkerIntegrationTest extends WalkerTest {
final static String commonArgs = "-R " + hg18Reference + " -T CompareCallableLoci -B comp1,Bed," + validationDataLocation + "1kg_slx.chr1_10mb.callable -B comp2,Bed," + validationDataLocation + "ga2_slx.chr1_10mb.callable -o %s";
@Test
public void testCompareCallableLociWalker1() {
String gatk_args = commonArgs + " -L chr1:1-10,000,000";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("70efebac55ff210bb022e9e22ed80a95"));
executeTest("CompareCallableLoci Walker", spec);
}
}