Separated the DoC calculations out of the XHMM pipeline, so that CalcDepthOfCoverage can be used to calculate joint, per-base coverage across multiple samples (e.g., family samples)

This commit is contained in:
Menachem Fromer 2012-09-10 15:08:44 -04:00
commit 449b89bd34
218 changed files with 10021 additions and 3540 deletions

7
.gitignore vendored
View File

@ -18,3 +18,10 @@ queueScatterGather
/bar*
integrationtests/
public/testdata/onTheFlyOutputTest.vcf
private/testdata/onTheFlyOutputTest.vcf
lib
html
gatkdocs
dist
build
resources

View File

@ -577,6 +577,7 @@
docletpathref="doclet.classpath"
classpathref="external.dependencies"
classpath="${java.classes}"
maxmemory="2g"
additionalparam="-build-timestamp "${build.timestamp}" -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet">
<sourcefiles>
<union>
@ -780,6 +781,7 @@
docletpathref="doclet.classpath"
classpathref="external.dependencies"
classpath="${java.classes}"
maxmemory="2g"
additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet"> <!-- -test to only do DocumentationTest walker -->
<sourcefiles>
<fileset refid="java.source.files"/>

View File

@ -34,17 +34,20 @@ import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
import org.broadinstitute.sting.utils.recalibration.RecalDatum;
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource {
// optimizations: don't reallocate an array each time
private byte[] tempQualArray;
private boolean[] tempErrorArray;
private double[] tempFractionalErrorArray;
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
super.initialize(covariates, recalibrationTables);
tempQualArray = new byte[EventType.values().length];
tempErrorArray = new boolean[EventType.values().length];
tempFractionalErrorArray = new double[EventType.values().length];
}
/**
@ -56,6 +59,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
* @param pileupElement The pileup element to update
* @param refBase The reference base at this locus
*/
@Override
public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
final int offset = pileupElement.getOffset();
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
@ -76,7 +80,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
rgRecalTable.put(rgThisDatum, keys[0], eventIndex);
else
rgPreviousDatum.combine(rgThisDatum);
@ -100,4 +104,53 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
}
}
}
/**
 * Incorporates every non-skipped base of a read into the recalibration tables, handling all
 * three event types (substitution, insertion, deletion) in a single pass over the read.
 *
 * Synchronized because the shared temp arrays ({@code tempQualArray},
 * {@code tempFractionalErrorArray}) and the recalibration tables are mutated in place.
 *
 * @param read            the read whose bases are being tabulated
 * @param skip            per-offset flags; offsets marked {@code true} are excluded
 * @param snpErrors       per-offset fractional error for base substitutions
 * @param insertionErrors per-offset fractional error for base insertions
 * @param deletionErrors  per-offset fractional error for base deletions
 */
@Override
public synchronized void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
    // Hoist loop-invariant work out of the per-offset loop: the covariate key cache, the
    // destination tables, and the per-read quality arrays are fixed for the whole read.
    // NOTE(review): assumes covariateKeySetFrom(read) is a pure lookup for a given read — confirm.
    final ReadCovariates readCovariates = covariateKeySetFrom(read);
    final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
    final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
    final byte[] baseQualities = read.getBaseQualities();
    final byte[] insertionQualities = read.getBaseInsertionQualities();
    final byte[] deletionQualities = read.getBaseDeletionQualities();
    final int readLength = read.getReadBases().length;

    for( int offset = 0; offset < readLength; offset++ ) {
        if( skip[offset] )
            continue;

        // Stage qualities and fractional errors per event type so the loop below can
        // treat all event types uniformly through the temp arrays.
        tempQualArray[EventType.BASE_SUBSTITUTION.index] = baseQualities[offset];
        tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset];
        tempQualArray[EventType.BASE_INSERTION.index] = insertionQualities[offset];
        tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset];
        tempQualArray[EventType.BASE_DELETION.index] = deletionQualities[offset];
        tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset];

        for (final EventType eventType : EventType.values()) {
            final int[] keys = readCovariates.getKeySet(offset, eventType);
            final int eventIndex = eventType.index;
            final byte qual = tempQualArray[eventIndex];
            final double isError = tempFractionalErrorArray[eventIndex];

            // Read-group table: keyed by read group (keys[0]) and event type only.
            final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
            final RecalDatum rgThisDatum = createDatumObject(qual, isError);
            if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
                rgRecalTable.put(rgThisDatum, keys[0], eventIndex);
            else
                rgPreviousDatum.combine(rgThisDatum);

            // Quality-score table: keyed by read group and reported quality (keys[1]).
            final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
            if (qualPreviousDatum == null)
                qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);
            else
                qualPreviousDatum.increment(1.0, isError);

            // Remaining (optional) covariate tables, one per covariate from index 2 on.
            for (int i = 2; i < covariates.length; i++) {
                if (keys[i] < 0) // negative key: this covariate does not apply at this offset
                    continue;
                final NestedIntegerArray<RecalDatum> covRecalTable = recalibrationTables.getTable(i);
                final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex);
                if (covPreviousDatum == null)
                    covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex);
                else
                    covPreviousDatum.increment(1.0, isError);
            }
        }
    }
}
}

View File

@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
@ -247,7 +247,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
* @return a linked list with all the reads produced by the clipping operations
*/
@Override
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
LinkedList<GATKSAMRecord> mappedReads;
totalReads++;
if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))

View File

@ -546,7 +546,7 @@ public class SlidingWindow {
FractionalDownsampler <GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
downsampler.submit(allReads);
return downsampler.consumeDownsampledItems();
return downsampler.consumeFinalizedItems();
}

View File

@ -52,7 +52,11 @@ public class GenotypingEngine {
noCall.add(Allele.NO_CALL);
}
// This function is the streamlined approach, currently not being used
// WARN
// This function is the streamlined approach, currently not being used by default
// WARN
// WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
// WARN
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
final ArrayList<Haplotype> haplotypes,
@ -184,6 +188,7 @@ public class GenotypingEngine {
return returnCalls;
}
// BUGBUG: Create a class to hold this complicated return type
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
final ArrayList<Haplotype> haplotypes,
@ -210,13 +215,8 @@ public class GenotypingEngine {
System.out.println( ">> Events = " + h.getEventMap());
}
}
// Create the VC merge priority list
final ArrayList<String> priorityList = new ArrayList<String>();
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
priorityList.add("HC" + iii);
}
cleanUpSymbolicUnassembledEvents( haplotypes, priorityList );
cleanUpSymbolicUnassembledEvents( haplotypes );
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
}
@ -229,13 +229,16 @@ public class GenotypingEngine {
// Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
if( activeAllelesToGenotype.isEmpty() ) {
for( final Haplotype h : haplotypes ) {
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
eventsAtThisLoc.add(vc);
priorityList.add(vc.getSource());
}
}
} else { // we are in GGA mode!
@ -260,6 +263,22 @@ public class GenotypingEngine {
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
// Sanity check the priority list
for( final VariantContext vc : eventsAtThisLoc ) {
if( !priorityList.contains(vc.getSource()) ) {
throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
}
}
for( final String name : priorityList ) {
boolean found = false;
for( final VariantContext vc : eventsAtThisLoc ) {
if(vc.getSource().equals(name)) { found = true; break; }
}
if( !found ) {
throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
}
}
// Merge the event to find a common reference representation
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
@ -299,9 +318,8 @@ public class GenotypingEngine {
return returnCalls;
}
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes, final ArrayList<String> priorityList ) {
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes ) {
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
final ArrayList<String> stringsToRemove = new ArrayList<String>();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().values() ) {
if( vc.isSymbolic() ) {
@ -309,7 +327,6 @@ public class GenotypingEngine {
for( final VariantContext vc2 : h2.getEventMap().values() ) {
if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
haplotypesToRemove.add(h);
stringsToRemove.add(vc.getSource());
break;
}
}
@ -318,7 +335,6 @@ public class GenotypingEngine {
}
}
haplotypes.removeAll(haplotypesToRemove);
priorityList.removeAll(stringsToRemove);
}
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {

View File

@ -27,26 +27,23 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.collections.Pair;
@ -54,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -103,6 +101,7 @@ import java.util.*;
@DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.LOCUS)
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@ActiveRegionExtension(extension=65, maxRegion=300)
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
@ -309,7 +308,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
if( !allelesToGenotype.contains(vc) ) {
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
}
}
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {

View File

@ -127,9 +127,9 @@ public class BQSRIntegrationTest extends WalkerTest {
@DataProvider(name = "PRTest")
public Object[][] createPRTestData() {
return new Object[][]{
{new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")},
{new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")},
{new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")},
{new PRTest("", "1532242f9fe90ef759a0faa5d85f61fb")},
{new PRTest(" -qq -1", "3dd2c87915c96ac55c3872026574d8cb")},
{new PRTest(" -qq 6", "5d012ee224f1cb4a7afac59e3655e20c")},
{new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")}
};
}

View File

@ -47,12 +47,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testBOTH_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","077db83cf7dc5490f670c85856b408b2");
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0ff90fa3882a3fb5089a7bba50dd8ae3");
}
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","e460a17377b731ff4eab36fb56042ecd");
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","90af837f372e3d5143af30bf5c8c2b75");
}
@Test(enabled = true)
@ -67,11 +67,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testMT_SNP_DISCOVERY_sp4() {
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da359fe7dd6dce045193198c264301ee");
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","c32e10070e10d30d33e5b882c1f89413");
}
@Test(enabled = true)
public void testMT_SNP_GGA_sp10() {
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "ad0eef3a9deaa098d79df62af7e5448a");
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "4d16d3c9475637bad70e9dc2eafe2da2");
}
}

View File

@ -66,4 +66,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e1f88fac91424740c0eaac1de48b3970");
}
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
    // Regression test — presumably covers reads that get modified during active-region
    // processing (see the name); the originating case is haplotype-problem-4.bam. TODO confirm.
    final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
    // The md5 pins the expected output file; it must be regenerated whenever the
    // HaplotypeCaller's output legitimately changes.
    final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("000fd36d5cf8090386bb2ac15e3ab0b5"));
    executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
}

View File

@ -1,5 +1,6 @@
library(gplots)
library(ggplot2)
library(tools)
# -------------------------------------------------------
# Utilities for displaying multiple plots per page
@ -59,6 +60,7 @@ closePDF <- function(outputPDF) {
if ( ! is.na(outputPDF) ) {
dev.off()
if (exists("compactPDF")) {
print("compacting PDF")
compactPDF(outputPDF)
}
}

View File

@ -245,7 +245,7 @@ public class FastaSequenceIndexBuilder {
* Reset iterators and add contig to sequence index
*/
private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
status = Status.NONE;
contig = "";
size = 0;
@ -258,6 +258,14 @@ public class FastaSequenceIndexBuilder {
}
}
/**
 * Trims a raw FASTA description line down to the contig name proper by removing any
 * characters from the first whitespace character onward.
 *
 * The FASTA format treats everything after the first whitespace on the '>' line as an
 * optional free-text description; the previous implementation only checked for a space
 * (' '), silently keeping tab-delimited descriptions as part of the contig name.
 *
 * @param contigName the raw contig name as read from the FASTA header; must not be null
 * @return the contig name up to (but excluding) the first whitespace character, or the
 *         input unchanged when it contains no whitespace
 */
private static String trimContigName(final String contigName) {
    for ( int i = 0; i < contigName.length(); i++ ) {
        // Character.isWhitespace covers both space and tab (and other whitespace).
        if ( Character.isWhitespace(contigName.charAt(i)) )
            return contigName.substring(0, i);
    }
    return contigName;
}
/**
* Stores FastaSequenceIndex as a .fasta.fai file on local machine
* Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder

View File

@ -31,7 +31,7 @@ import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -81,7 +81,7 @@ public class AlignmentValidation extends ReadWalker<Integer,Integer> {
* @return Number of reads aligned by this map (aka 1).
*/
@Override
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
//logger.info(String.format("examining read %s", read.getReadName()));
byte[] bases = read.getReadBases();

View File

@ -1,139 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.File;
/**
 * Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting
 * alignments in SAM or BAM format.  Mimics the steps 'bwa aln' followed by 'bwa samse'
 * using the BWA/C implementation.
 *
 * @author mhanna
 * @version 0.1
 */
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
@WalkerName("Align")
public class AlignmentWalker extends ReadWalker<Integer,Integer> {
    @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
            "generated by bwa index -d bwtsw. If unspecified, will default " +
            "to the reference specified via the -R argument.",required=false)
    private File targetReferenceFile = null;

    // Destination for the aligned reads (SAM/BAM), header written in initialize().
    @Output
    private StingSAMFileWriter out = null;

    /**
     * The actual aligner.  Holds native BWT resources until close() is called in onTraversalDone().
     */
    private BWACAligner aligner = null;

    /**
     * New header to use, if desired.  Built in initialize() from the input header plus the
     * target reference's sequence dictionary.
     */
    private SAMFileHeader header;

    /**
     * Create an aligner object. The aligner object will load and hold the BWT until close() is called.
     */
    @Override
    public void initialize() {
        // Fall back to the engine-wide reference when no explicit target reference was given.
        if(targetReferenceFile == null)
            targetReferenceFile = getToolkit().getArguments().referenceFile;
        BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
        BWAConfiguration configuration = new BWAConfiguration();
        aligner = new BWACAligner(bwtFiles,configuration);

        // Take the header of the SAM file, tweak it by adding in the reference dictionary
        // and specifying that the target file is unsorted.
        header = getToolkit().getSAMFileHeader().clone();
        SAMSequenceDictionary referenceDictionary =
                ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
        header.setSequenceDictionary(referenceDictionary);
        header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
        out.writeHeader(header);
    }

    /**
     * Aligns a read to the given reference and writes the resulting alignment to the output.
     *
     * @param ref  Reference over the read. Read will most likely be unmapped, so ref will be null.
     * @param read Read to align.
     * @param metaDataTracker reference-ordered data tracker (unused here).
     * @return always 1 — exactly one alignment record is emitted per input read.
     */
    @Override
    public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
        SAMRecord alignedRead = aligner.align(read,header);
        out.addAlignment(alignedRead);
        return 1;
    }

    /**
     * Initial value for reduce. In this case, alignments will be counted.
     * @return 0, indicating no alignments yet found.
     */
    @Override
    public Integer reduceInit() { return 0; }

    /**
     * Calculates the number of alignments found.
     * @param value Number of alignments found by this map.
     * @param sum Number of alignments found before this map.
     * @return Number of alignments found up to and including this map.
     */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Cleanup: releases the native BWT resources held by the aligner.
     * @param result Number of reads processed.
     */
    @Override
    public void onTraversalDone(Integer result) {
        aligner.close();
        super.onTraversalDone(result);
    }
}

View File

@ -1,132 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
/**
 * Counts the number of best alignments as presented by BWA and outputs a histogram of
 * number of placements vs. the frequency of that number of placements.
 *
 * @author mhanna
 * @version 0.1
 */
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
public class CountBestAlignments extends ReadWalker<Integer,Integer> {
    /**
     * The supporting BWT index generated using BWT.
     */
    @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
    private String prefix = null;

    // Destination for the placements-vs-frequency histogram, printed in onTraversalDone().
    @Output
    private PrintStream out = null;

    /**
     * The actual aligner.  Holds native resources until close() is called in onTraversalDone().
     */
    private Aligner aligner = null;

    // Histogram: number of alignments found for a read -> how many reads had that count.
    // TreeMap keeps the histogram sorted by placement count for output.
    private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();

    /**
     * Create an aligner object. The aligner object will load and hold the BWT until close() is called.
     */
    @Override
    public void initialize() {
        // Fall back to the engine-wide reference path when no explicit prefix was given.
        if(prefix == null)
            prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
        BWTFiles bwtFiles = new BWTFiles(prefix);
        BWAConfiguration configuration = new BWAConfiguration();
        aligner = new BWACAligner(bwtFiles,configuration);
    }

    /**
     * Aligns a read to the given reference and records how many placements BWA reported
     * for it in the histogram.  Reads with no alignments are not recorded.
     *
     * @param ref  Reference over the read. Read will most likely be unmapped, so ref will be null.
     * @param read Read to align.
     * @param metaDataTracker reference-ordered data tracker (unused here).
     * @return always 1 — one read processed per call.
     */
    @Override
    public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
        Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
        if(alignmentIterator.hasNext()) {
            int numAlignments = alignmentIterator.next().length;
            // Increment the histogram bucket for this placement count.
            if(alignmentFrequencies.containsKey(numAlignments))
                alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
            else
                alignmentFrequencies.put(numAlignments,1);
        }
        return 1;
    }

    /**
     * Initial value for reduce. In this case, processed reads will be counted.
     * @return 0, indicating no reads yet processed.
     */
    @Override
    public Integer reduceInit() { return 0; }

    /**
     * Calculates the number of reads processed.
     * @param value Number of reads processed by this map.
     * @param sum Number of reads processed before this map.
     * @return Number of reads processed up to and including this map.
     */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Cleanup: releases aligner resources and prints the histogram as tab-separated
     * "placements<TAB>frequency" lines.
     * @param result Number of reads processed.
     */
    @Override
    public void onTraversalDone(Integer result) {
        aligner.close();
        for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
            out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
        super.onTraversalDone(result);
    }
}

View File

@ -117,6 +117,15 @@ public final class RodBinding<T extends Feature> {
this.bound = true;
}
/**
 * For testing purposes only.  Creates a RodBinding sufficient for looking up associations to rawName.
 *
 * Delegates to the primary constructor with placeholder values ("missing" and the type's
 * simple name) and an empty tag set — presumably the source file and variable name slots;
 * confirm against the primary constructor's parameter order.
 *
 * @param type    the Feature class this binding produces
 * @param rawName the raw track name that lookups will be associated with
 */
public RodBinding(Class<T> type, final String rawName) {
    this(type, rawName, "missing", type.getSimpleName(), new Tags());
}
/**
* Make an unbound RodBinding<T>. Only available for creating the globally unique UNBOUND object
* @param type class this unbound RodBinding creates

View File

@ -112,31 +112,35 @@ public class CommandLineGATK extends CommandLineExecutable {
}
}
protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
private static void checkForMaskedUserErrors(final Throwable t) {
final String message = t.getMessage();
if ( message == null )
return;
// we know what to do about the common "Too many open files" error
if ( message.indexOf("Too many open files") != -1 )
if ( message.contains("Too many open files") )
exitSystemWithUserError(new UserException.TooManyOpenFiles());
// malformed BAM looks like a SAM file
if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 ||
message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 )
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
exitSystemWithSamError(t);
// can't close tribble index when writing
if ( message.indexOf("Unable to close index for") != -1 )
if ( message.contains("Unable to close index for") )
exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
// disk is full
if ( message.indexOf("No space left on device") != -1 )
exitSystemWithUserError(new UserException(t.getMessage()));
if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 )
exitSystemWithUserError(new UserException(t.getCause().getMessage()));
if ( message.contains("No space left on device") )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
// masked out of memory error
if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError )
exitSystemWithUserError(new UserException.NotEnoughMemory());
}
/**

View File

@ -1,52 +0,0 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*
* @author hanna
* @version 0.1
*/
public class DownsamplingMethod {
/**
* Type of downsampling to perform.
*/
public final DownsampleType type;
/**
* Actual downsampling target is specified as an integer number of reads.
*/
public final Integer toCoverage;
/**
* Actual downsampling target is specified as a fraction of total available reads.
*/
public final Double toFraction;
/**
* Expresses no downsampling applied at all.
*/
public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null);
public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
// Do some basic sanity checks on the downsampling parameters passed in.
// Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator.
if(type != DownsampleType.NONE && toFraction == null && toCoverage == null)
throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
// Fraction and coverage cannot both be specified.
if(toFraction != null && toCoverage != null)
throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");
// Experimental by sample downsampling does not work with a fraction of reads.
if(type == DownsampleType.BY_SAMPLE && toFraction != null)
throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");
this.type = type;
this.toCoverage = toCoverage;
this.toFraction = toFraction;
}
}

View File

@ -30,19 +30,21 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
@ -50,20 +52,16 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
/**
@ -136,11 +134,18 @@ public class GenomeAnalysisEngine {
*/
private Collection<ReadFilter> filters;
/**
* Collection of the read transformers applied to the reads
*/
private List<ReadTransformer> readTransformers;
/**
* Controls the allocation of threads between CPU vs IO.
*/
private ThreadAllocation threadAllocation;
private ReadMetrics cumulativeMetrics = null;
/**
* A currently hacky unique name for this GATK instance
*/
@ -175,6 +180,13 @@ public class GenomeAnalysisEngine {
*/
private Collection<RMDTriplet> referenceMetaDataFiles;
/**
* The threading efficiency monitor we use in the GATK to monitor our efficiency.
*
* May be null if one isn't active, or hasn't be initialized yet
*/
private ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
/**
* Set the reference metadata files to use for this traversal.
* @param referenceMetaDataFiles Collection of files and descriptors over which to traverse.
@ -252,6 +264,7 @@ public class GenomeAnalysisEngine {
// our microscheduler, which is in charge of running everything
MicroScheduler microScheduler = createMicroscheduler();
threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
// create temp directories as necessary
initializeTempDirectory();
@ -280,6 +293,8 @@ public class GenomeAnalysisEngine {
static {
deprecatedGATKWalkers.put("CountCovariates", "2.0");
deprecatedGATKWalkers.put("TableRecalibration", "2.0");
deprecatedGATKWalkers.put("AlignmentWalker", "2.2");
deprecatedGATKWalkers.put("CountBestAlignments", "2.2");
}
/**
@ -349,32 +364,59 @@ public class GenomeAnalysisEngine {
return Collections.unmodifiableList(filters);
}
/**
* Returns a list of active, initialized read transformers
*
* @param walker the walker we need to apply read transformers too
* @return a non-null list of read transformers
*/
public void initializeReadTransformers(final Walker walker) {
final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>();
final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class);
final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null;
final PluginManager<ReadTransformer> pluginManager = new PluginManager<ReadTransformer>(ReadTransformer.class);
for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) {
transformer.initialize(overrideTime, this, walker);
if ( transformer.enabled() )
activeTransformers.add(transformer);
}
setReadTransformers(activeTransformers);
}
public List<ReadTransformer> getReadTransformers() {
return readTransformers;
}
private void setReadTransformers(final List<ReadTransformer> readTransformers) {
if ( readTransformers == null )
throw new ReviewedStingException("read transformers cannot be null");
this.readTransformers = readTransformers;
}
/**
* Parse out the thread allocation from the given command-line argument.
*/
private void determineThreadAllocation() {
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads);
if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread);
if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads);
// TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
Integer numCPUThreads = null;
if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null)
throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
else if(tags.containsKey("cpu"))
numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
else if(argCollection.numberOfCPUThreads != null)
numCPUThreads = argCollection.numberOfCPUThreads;
Integer numIOThreads = null;
if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
else if(tags.containsKey("io"))
numIOThreads = Integer.parseInt(tags.getValue("io"));
else if(argCollection.numberOfIOThreads != null)
numIOThreads = argCollection.numberOfIOThreads;
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads,
argCollection.numberOfCPUThreadsPerDataThread,
argCollection.numberOfIOThreads,
! argCollection.disableEfficiencyMonitor);
}
public int getTotalNumberOfThreads() {
return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads();
}
/**
* Allow subclasses and others within this package direct access to the walker manager.
* @return The walker manager used by this package.
@ -400,23 +442,24 @@ public class GenomeAnalysisEngine {
protected DownsamplingMethod getDownsamplingMethod() {
GATKArgumentCollection argCollection = this.getArguments();
DownsamplingMethod method;
if(argCollection.getDownsamplingMethod() != null)
method = argCollection.getDownsamplingMethod();
else if(WalkerManager.getDownsamplingMethod(walker) != null)
method = WalkerManager.getDownsamplingMethod(walker);
else
method = GATKArgumentCollection.getDefaultDownsamplingMethod();
return method;
boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
// until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling
if ( useExperimentalDownsampling ) {
throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!");
}
DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
}
protected void setDownsamplingMethod(DownsamplingMethod method) {
argCollection.setDownsamplingMethod(method);
}
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
protected boolean includeReadsWithDeletionAtLoci() {
return walker.includeReadsWithDeletionAtLoci();
}
@ -697,13 +740,12 @@ public class GenomeAnalysisEngine {
protected void initializeDataSources() {
logger.info("Strictness is " + argCollection.strictnessLevel);
// TODO -- REMOVE ME
BAQ.DEFAULT_GOP = argCollection.BAQGOP;
validateSuppliedReference();
setReferenceDataSource(argCollection.referenceFile);
validateSuppliedReads();
initializeReadTransformers(walker);
readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());
for (ReadFilter filter : filters)
@ -784,14 +826,13 @@ public class GenomeAnalysisEngine {
* @return A data source for the given set of reads.
*/
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
DownsamplingMethod method = getDownsamplingMethod();
DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
// Synchronize the method back into the collection so that it shows up when
// interrogating for the downsample method during command line recreation.
setDownsamplingMethod(method);
setDownsamplingMethod(downsamplingMethod);
if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
logger.info(downsamplingMethod);
if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
@ -809,14 +850,11 @@ public class GenomeAnalysisEngine {
argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel,
argCollection.readBufferSize,
method,
downsamplingMethod,
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
filters,
readTransformers,
includeReadsWithDeletionAtLoci(),
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
getBaseRecalibration(),
argCollection.defaultBaseQualities,
removeProgramRecords);
}
@ -1000,7 +1038,19 @@ public class GenomeAnalysisEngine {
* owned by the caller; the caller can do with the object what they wish.
*/
public ReadMetrics getCumulativeMetrics() {
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
// todo -- probably shouldn't be lazy
if ( cumulativeMetrics == null )
cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics();
return cumulativeMetrics;
}
/**
* Return the global ThreadEfficiencyMonitor, if there is one
*
* @return the monitor, or null if none is active
*/
public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
return threadEfficiencyMonitor;
}
// -------------------------------------------------------------------------------------

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.filter.SamRecordFilter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable {
return nRecords;
}
/**
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
*/
public void incrementNumIterations(final long by) {
nRecords += by;
}
/**
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
*/
public void incrementNumIterations() {
nRecords++;
incrementNumIterations(1);
}
public long getNumReadsSeen() {

View File

@ -1,15 +1,15 @@
package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import java.util.Collection;
import java.util.List;
/**
* User: hanna
* Date: May 14, 2009
@ -34,12 +34,9 @@ public class ReadProperties {
private final DownsamplingMethod downsamplingMethod;
private final ValidationExclusion exclusionList;
private final Collection<ReadFilter> supplementalFilters;
private final List<ReadTransformer> readTransformers;
private final boolean includeReadsWithDeletionAtLoci;
private final boolean useOriginalBaseQualities;
private final BAQ.CalculationMode cmode;
private final BAQ.QualityMode qmode;
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
private final BaseRecalibration bqsrApplier;
private final byte defaultBaseQualities;
/**
@ -95,6 +92,11 @@ public class ReadProperties {
return supplementalFilters;
}
public List<ReadTransformer> getReadTransformers() {
return readTransformers;
}
/**
* Return whether to use original base qualities.
* @return Whether to use original base qualities.
@ -103,16 +105,6 @@ public class ReadProperties {
return useOriginalBaseQualities;
}
public BAQ.QualityMode getBAQQualityMode() { return qmode; }
public BAQ.CalculationMode getBAQCalculationMode() { return cmode; }
public IndexedFastaSequenceFile getRefReader() {
return refReader;
}
public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
/**
* @return Default base quality value to fill reads missing base quality information.
*/
@ -134,9 +126,6 @@ public class ReadProperties {
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
* @param cmode How should we apply the BAQ calculation to the reads?
* @param qmode How should we apply the BAQ calculation to the reads?
* @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
*/
public ReadProperties( Collection<SAMReaderID> samFiles,
@ -146,11 +135,8 @@ public class ReadProperties {
DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
List<ReadTransformer> readTransformers,
boolean includeReadsWithDeletionAtLoci,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
this.readers = samFiles;
this.header = header;
@ -158,12 +144,9 @@ public class ReadProperties {
this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod;
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
this.supplementalFilters = supplementalFilters;
this.readTransformers = readTransformers;
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
this.useOriginalBaseQualities = useOriginalBaseQualities;
this.cmode = cmode;
this.qmode = qmode;
this.refReader = refReader;
this.bqsrApplier = bqsrApplier;
this.defaultBaseQualities = defaultBaseQualities;
}
}

View File

@ -27,15 +27,18 @@ package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.lang.annotation.Annotation;
import java.util.*;
/**
@ -303,9 +306,10 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will
* be used instead.
* @param walkerClass The class of the walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists.
*/
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass) {
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
DownsamplingMethod downsamplingMethod = null;
if( walkerClass.isAnnotationPresent(Downsample.class) ) {
@ -313,17 +317,17 @@ public class WalkerManager extends PluginManager<Walker> {
DownsampleType type = downsampleParameters.by();
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction);
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
}
return downsamplingMethod;
}
public static BAQ.QualityMode getBAQQualityMode(Walker walker) {
return walker.getClass().getAnnotation(BAQMode.class).QualityMode();
public static <T extends Annotation> T getWalkerAnnotation(final Walker walker, final Class<T> clazz) {
return walker.getClass().getAnnotation(clazz);
}
public static BAQ.ApplicationTime getBAQApplicationTime(Walker walker) {
public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) {
return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime();
}
@ -332,10 +336,11 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will
* be used instead.
* @param walker The walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists.
*/
public static DownsamplingMethod getDownsamplingMethod(Walker walker) {
return getDownsamplingMethod(walker.getClass());
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
}
/**

View File

@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.QualityUtils;
@ -41,7 +41,9 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import java.io.File;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* @author aaron
@ -138,15 +140,11 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
/**
* The override mechanism in the GATK, by default, populates the command-line arguments, then
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
* by a user explicitly specifying command-line arguments.
* TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments.
*/
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
// --------------------------------------------------------------------------------------------------------------
//
// Downsampling Arguments
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
public DownsampleType downsamplingType = null;
@ -156,17 +154,20 @@ public class GATKArgumentCollection {
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
public Integer downsampleCoverage = null;
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
@Hidden
public boolean enableExperimentalDownsampling = false;
/**
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
* a default downsampling mechanism, return the default.
* @return The explicitly specified downsampling mechanism, or the default if none exists.
*/
public DownsamplingMethod getDownsamplingMethod() {
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
return null;
if(downsamplingType == null && downsampleCoverage != null)
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null);
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
}
/**
@ -176,9 +177,11 @@ public class GATKArgumentCollection {
public void setDownsamplingMethod(DownsamplingMethod method) {
if (method == null)
throw new IllegalArgumentException("method is null");
downsamplingType = method.type;
downsampleCoverage = method.toCoverage;
downsampleFraction = method.toFraction;
enableExperimentalDownsampling = method.useExperimentalDownsampling;
}
// --------------------------------------------------------------------------------------------------------------
@ -197,17 +200,14 @@ public class GATKArgumentCollection {
// performance log arguments
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
/**
* Gets the default downsampling method, returned if the user didn't specify any downsampling
* method.
* @return The default downsampling mechanism, or null if none exists.
* The file name for the GATK performance log output, or null if you don't want to generate the
* detailed performance logging table. This table is suitable for importing into R or any
* other analysis software that can read tsv files
*/
public static DownsamplingMethod getDefaultDownsamplingMethod() {
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
}
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
public Boolean useOriginalBaseQualities = false;
@ -279,20 +279,40 @@ public class GATKArgumentCollection {
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
/** How many threads should be allocated to this analysis. */
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
public Integer numberOfThreads = 1;
// --------------------------------------------------------------------------------------------------------------
//
// Multi-threading arguments
//
// --------------------------------------------------------------------------------------------------------------
/**
* The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
* TODO: Kill this when I can do a tagged integer in Queue.
* How many data threads should be allocated to this analysis? Data threads contains N cpu threads per
* data thread, and act as completely data parallel processing, increasing the memory usage of GATK
* by M data threads. Data threads generally scale extremely effectively, up to 24 cores
*/
@Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
@Hidden
public Integer numberOfCPUThreads = null;
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false)
public Integer numberOfDataThreads = 1;
/**
* How many CPU threads should be allocated per data thread? Each CPU thread operates the map
* cycle independently, but may run into earlier scaling problems with IO than data threads. Has
* the benefit of not requiring X times as much memory per thread as data threads do, but rather
* only a constant overhead.
*/
@Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false)
public int numberOfCPUThreadsPerDataThread = 1;
@Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
@Hidden
public Integer numberOfIOThreads = null;
public int numberOfIOThreads = 0;
/**
* By default the GATK monitors its own efficiency, but this can have an itsy-bitsy tiny
* cost (< 0.1%) in runtime because of turning on the JavaBean. This argument allows you
* to disable the monitor
*/
@Argument(fullName = "disableThreadEfficiencyMonitor", shortName = "dtem", doc = "Disable GATK efficiency monitoring", required = false)
public Boolean disableEfficiencyMonitor = false;
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
public Integer numberOfBAMFileHandles = null;

View File

@ -177,7 +177,7 @@ public class ReferenceContext {
* @return The base at the given locus from the reference.
*/
public byte getBase() {
return getBases()[(int)(locus.getStart() - window.getStart())];
return getBases()[(locus.getStart() - window.getStart())];
}
/**

View File

@ -0,0 +1,143 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.Collection;
import java.util.LinkedList;
import java.util.ListIterator;
/**
* Key algorithmic helper for ReadBasedReferenceOrderedData
*
* Takes a single iterator of features, and provides a single capability that returns
* the list of RODs that overlap an interval. Allows sequential getOverlapping calls
* from intervals provided that these intervals always have increasing getStart() values.
*
*/
class IntervalOverlappingRODsFromStream {
/**
* Only held for QC purposes
*/
GenomeLoc lastQuery = null;
private final String name;
private final LinkedList<GATKFeature> currentFeatures = new LinkedList<GATKFeature>();
private final PeekableIterator<RODRecordList> futureFeatures;
/**
* Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and
* returns RODRecordLists having name
*
* @param name
* @param futureFeatures
*/
IntervalOverlappingRODsFromStream(final String name, final PeekableIterator<RODRecordList> futureFeatures) {
if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null");
this.name = name;
this.futureFeatures = futureFeatures;
}
/**
* Get the list of RODs overlapping loc from this stream of RODs.
*
* Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart
*
* @param loc the interval to query
* @return a non-null RODRecordList containing the overlapping RODs, which may be empty
*/
@Ensures({"overlaps(loc, result)",
"! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)",
"result != null"})
public RODRecordList getOverlapping(final GenomeLoc loc) {
if ( lastQuery != null && loc.getStart() < lastQuery.getStart() )
throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery));
trimCurrentFeaturesToLoc(loc);
readOverlappingFutureFeatures(loc);
return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc);
}
/**
* For contract assurance. Checks that all bindings in loc overlap
*
* @param loc
* @param bindings
* @return
*/
@Requires({"loc != null", "bindings != null"})
private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) {
for ( final GATKFeature feature : bindings )
if ( ! feature.getLocation().overlapsP(loc) )
return false;
return true;
}
/**
* Subset the features in all to those that overlap with loc
*
* The current features list contains everything read that cannot be thrown away yet, but not
* everything in there necessarily overlaps with loc. Subset to just those that do overlap
*
* @param loc the location that features must overlap
* @param all the list of all features
* @return a subset of all that overlaps with loc
*/
@Requires({"loc != null", "all != null"})
@Ensures("result.size() <= all.size()")
private Collection<GATKFeature> subsetToOverlapping(final GenomeLoc loc, final Collection<GATKFeature> all) {
final LinkedList<GATKFeature> overlapping = new LinkedList<GATKFeature>();
for ( final GATKFeature feature : all )
if ( feature.getLocation().overlapsP(loc) )
overlapping.add(feature);
return overlapping;
}
/**
* Update function. Remove all elements of currentFeatures that end before loc
*
* @param loc the location to use
*/
@Requires("loc != null")
@Ensures("currentFeatures.size() <= old(currentFeatures.size())")
private void trimCurrentFeaturesToLoc(final GenomeLoc loc) {
final ListIterator<GATKFeature> it = currentFeatures.listIterator();
while ( it.hasNext() ) {
final GATKFeature feature = it.next();
if ( feature.getLocation().isBefore(loc) )
it.remove();
}
}
/**
* Update function: Read all elements from futureFeatures that overlap with loc
*
* Stops at the first element that starts before the end of loc, or the stream empties
*
* @param loc
*/
@Requires("loc != null")
@Ensures("currentFeatures.size() >= old(currentFeatures.size())")
private void readOverlappingFutureFeatures(final GenomeLoc loc) {
while ( futureFeatures.hasNext() ) {
final GenomeLoc nextLoc = futureFeatures.peek().getLocation();
if ( nextLoc.isBefore(loc) ) {
futureFeatures.next(); // next rod element is before loc, throw it away and keep looking
} else if ( nextLoc.isPast(loc) ) {
break; // next element is past loc, stop looking but don't pop it
} else if ( nextLoc.overlapsP(loc) ) {
// add overlapping elements to our current features, removing from stream
for ( final GATKFeature feature : futureFeatures.next() ) {
currentFeatures.add(feature);
}
}
}
}
}

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View {
// Cache the current and apply filtering.
AlignmentContext current = nextLocus;
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
}
// Indicate that the next operation will need to advance.
nextLocus = null;

View File

@ -58,7 +58,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView {
// todo -- warning, I removed the reference to the name from states
bindings.add( state.iterator.seekForward(loc) );
return new RefMetaDataTracker(bindings, referenceContext);
return new RefMetaDataTracker(bindings);
}
/**

View File

@ -23,40 +23,63 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeMap;
/** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */
/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */
public class ReadBasedReferenceOrderedView implements View {
private final WindowedData window;
public ReadBasedReferenceOrderedView(ShardDataProvider provider) {
window = new WindowedData(provider);
provider.register(this);
}
// a list of the RMDDataState (location->iterators)
private final List<RMDDataState> states = new ArrayList<RMDDataState>(1);
private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker();
/**
* for testing only please
*
* @param data the window provider
* Used to get genome locs for reads
*/
ReadBasedReferenceOrderedView(WindowedData data) {
window = data;
private final GenomeLocParser genomeLocParser;
/**
* The total extent of all reads in this span. We create iterators from our RODs
* from the start of this span, to the end.
*/
private final GenomeLoc shardSpan;
public ReadBasedReferenceOrderedView(final ShardDataProvider provider) {
this.genomeLocParser = provider.getGenomeLocParser();
// conditional to optimize the case where we don't have any ROD data
this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null;
provider.register(this);
if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) {
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan)));
}
}
public ReadMetaDataTracker getReferenceOrderedDataForRead(SAMRecord read) {
return window.getTracker(read);
/**
* Testing constructor
*/
protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser,
final GenomeLoc shardSpan,
final List<String> names,
final List<PeekableIterator<RODRecordList>> featureSources) {
this.genomeLocParser = genomeLocParser;
this.shardSpan = shardSpan;
for ( int i = 0; i < names.size(); i++ )
states.add(new RMDDataState(names.get(i), featureSources.get(i)));
}
public Collection<Class<? extends View>> getConflictingViews() {
@ -65,135 +88,72 @@ public class ReadBasedReferenceOrderedView implements View {
return classes;
}
public void close() {
if (window != null) window.close();
}
}
/** stores a window of data, dropping RODs if we've passed the new reads start point. */
class WindowedData {
// the queue of possibly in-frame RODs; RODs are removed as soon as they are out of scope
private final TreeMap<Integer, RODMetaDataContainer> mapping = new TreeMap<Integer, RODMetaDataContainer>();
// our current location from the last read we processed
private GenomeLoc currentLoc;
// a list of the RMDDataState (location->iterators)
private List<RMDDataState> states;
// the provider; where we get all our information
private final ShardDataProvider provider;
/**
* our log, which we want to capture anything from this class
*/
private static Logger logger = Logger.getLogger(WindowedData.class);
/**
* create a WindowedData given a shard provider
*
* @param provider the ShardDataProvider
*/
public WindowedData(ShardDataProvider provider) {
this.provider = provider;
}
/**
* load the states dynamically, since the only way to get a genome loc is from the read (the shard doesn't have one)
*
* @param provider the ShardDataProvider
* @param rec the current read
*/
private void getStates(ShardDataProvider provider, SAMRecord rec) {
int stop = Integer.MAX_VALUE;
// figure out the appropriate alignment stop
if (provider.hasReference()) {
stop = provider.getReference().getSequenceDictionary().getSequence(rec.getReferenceIndex()).getSequenceLength();
}
// calculate the range of positions we need to look at
GenomeLoc range = provider.getGenomeLocParser().createGenomeLoc(rec.getReferenceName(),
rec.getAlignmentStart(),
stop);
states = new ArrayList<RMDDataState>();
if (provider.getReferenceOrderedData() != null)
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
states.add(new RMDDataState(dataSource, dataSource.seek(range)));
}
/**
* this function is for testing only
*
* @param states a list of RMDDataState to initialize with
*/
WindowedData(List<RMDDataState> states) {
this.states = states;
provider = null;
}
/**
* create a ReadMetaDataTracker given the current read
* create a RefMetaDataTracker given the current read
*
* @param rec the read
*
* @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments
* @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments
*/
public ReadMetaDataTracker getTracker(SAMRecord rec) {
updatePosition(rec);
return new ReadMetaDataTracker(provider.getGenomeLocParser(), rec, mapping);
@Requires("rec != null")
@Ensures("result != null")
public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) {
if ( rec.getReadUnmappedFlag() )
// empty RODs for unmapped reads
return new RefMetaDataTracker();
else
return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec));
}
/**
* update the position we're storing
*
* @param rec the read to use for start and end
*/
private void updatePosition(SAMRecord rec) {
if (states == null) getStates(this.provider, rec);
currentLoc = provider.getGenomeLocParser().createGenomeLoc(rec);
// flush the queue looking for records we've passed over
while (mapping.size() > 0 && mapping.firstKey() < currentLoc.getStart())
mapping.pollFirstEntry(); // toss away records that we've passed
// add new data to the queue
for (RMDDataState state : states) {
// move into position
while (state.iterator.hasNext() && state.iterator.peekNextLocation().isBefore(currentLoc))
state.iterator.next();
while (state.iterator.hasNext() && state.iterator.peekNextLocation().overlapsP(currentLoc)) {
RODRecordList list = state.iterator.next();
for (GATKFeature datum : list) {
if (!mapping.containsKey(list.getLocation().getStart()))
mapping.put(list.getLocation().getStart(), new RODMetaDataContainer());
mapping.get(list.getLocation().getStart()).addEntry(datum);
}
}
@Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"})
@Ensures("result != null")
public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) {
if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers)
return EMPTY_TRACKER;
else {
final List<RODRecordList> bindings = new ArrayList<RODRecordList>(states.size());
for ( final RMDDataState state : states )
bindings.add(state.stream.getOverlapping(interval));
return new RefMetaDataTracker(bindings);
}
}
/** Closes the current view. */
/**
* Closes the current view.
*/
public void close() {
if (states == null) return;
for (RMDDataState state : states)
state.dataSource.close( state.iterator );
for (final RMDDataState state : states)
state.close();
// Clear out the existing data so that post-close() accesses to this data will fail-fast.
states = null;
states.clear();
}
/** Models the traversal state of a given ROD lane. */
private static class RMDDataState {
public final ReferenceOrderedDataSource dataSource;
public final IntervalOverlappingRODsFromStream stream;
private final LocationAwareSeekableRODIterator iterator;
}
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
this.dataSource = dataSource;
this.iterator = iterator;
this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<RODRecordList>(iterator));
}
/** Models the traversal state of a given ROD lane. */
class RMDDataState {
public final ReferenceOrderedDataSource dataSource;
public final LocationAwareSeekableRODIterator iterator;
/**
* For testing
*/
public RMDDataState(final String name, final PeekableIterator<RODRecordList> iterator) {
this.dataSource = null;
this.iterator = null;
this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<RODRecordList>(iterator));
}
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
this.dataSource = dataSource;
this.iterator = iterator;
public void close() {
if ( dataSource != null )
dataSource.close( iterator );
}
}
}

View File

@ -59,16 +59,18 @@ public class ReadReferenceView extends ReferenceView {
}
public byte[] getBases() {
// System.out.printf("Getting bases for location %s%n", loc);
// throw new StingException("x");
return getReferenceBases(loc);
}
}
public ReferenceContext getReferenceContext( SAMRecord read ) {
/**
* Return a reference context appropriate for the span of read
*
* @param read the mapped read to test
* @return
*/
public ReferenceContext getReferenceContext( final SAMRecord read ) {
GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
// byte[] bases = super.getReferenceBases(loc);
// return new ReferenceContext( loc, loc, bases );
return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) );
}

View File

@ -101,7 +101,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) {
// special case the interval again -- add it into the ROD
if ( interval != null ) { allTracksHere.add(interval); }
return new RefMetaDataTracker(allTracksHere, referenceContext);
return new RefMetaDataTracker(allTracksHere);
}
public boolean hasNext() {

View File

@ -94,6 +94,13 @@ public abstract class ShardDataProvider {
return referenceOrderedData;
}
/**
 * Whether this shard will supply any reference-ordered data (ROD) tracks.
 *
 * @return true if reference ordered data will be provided by this shard, false otherwise
 */
public boolean hasReferenceOrderedData() {
    return ! getReferenceOrderedData().isEmpty();
}
/**
* Create a data provider for the shard given the reads and reference.
* @param shard The chunk of data over which traversals happen.

View File

@ -6,11 +6,9 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
*
@ -58,6 +56,15 @@ public class ReadShard extends Shard {
MAX_READS = bufferSize;
}
/**
 * What read buffer size are we using?
 *
 * @return the current maximum number of reads (MAX_READS) held by a single buffering shard
 */
public static int getReadBufferSize() {
    return MAX_READS;
}
/**
* Returns true if this shard is meant to buffer reads, rather
* than just holding pointers to their locations.
@ -116,4 +123,33 @@ public class ReadShard extends Shard {
}
return sb.toString();
}
/**
 * Get the full span from the start of the left most read to the end of the right most one
 *
 * Note this may be different than the getLocation() of the shard, as this reflects the
 * targeted span, not the actual span of reads
 *
 * @return the genome loc representing the span of these reads on the genome
 */
public GenomeLoc getReadsSpan() {
    // with no mapped reads (or no targeted locations) fall back to the shard's own location
    if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() )
        return super.getLocation();

    String spanContig = null;
    int minStart = Integer.MAX_VALUE;
    int maxStop  = Integer.MIN_VALUE;

    for ( final SAMRecord read : reads ) {
        final String readContig = read.getReferenceName();

        // all reads in a shard must live on a single contig; reject cross-contig shards
        if ( spanContig != null && ! readContig.equals(spanContig) )
            throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
                    + "First contig is " + spanContig + " next read was " + readContig );
        spanContig = readContig;

        minStart = Math.min(minStart, read.getAlignmentStart());
        maxStop  = Math.max(maxStop,  read.getAlignmentEnd());
    }

    return parser.createGenomeLoc(spanContig, minStart, maxStop);
}
}

View File

@ -24,14 +24,15 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.MergingSamRecordIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.*;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -42,12 +43,9 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.baq.ReadTransformingIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import java.io.File;
@ -156,6 +154,8 @@ public class SAMDataSource {
*/
private final ThreadAllocation threadAllocation;
private final boolean expandShardsForDownsampling;
/**
* Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files.
@ -200,11 +200,8 @@ public class SAMDataSource {
downsamplingMethod,
exclusionList,
supplementalFilters,
Collections.<ReadTransformer>emptyList(),
includeReadsWithDeletionAtLoci,
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
null, // no BQSR
(byte) -1,
false);
}
@ -234,11 +231,8 @@ public class SAMDataSource {
DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
List<ReadTransformer> readTransformers,
boolean includeReadsWithDeletionAtLoci,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities,
boolean removeProgramRecords) {
this.readMetrics = new ReadMetrics();
@ -262,7 +256,7 @@ public class SAMDataSource {
else {
// Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively
// will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000));
ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000));
}
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
@ -308,13 +302,15 @@ public class SAMDataSource {
downsamplingMethod,
exclusionList,
supplementalFilters,
readTransformers,
includeReadsWithDeletionAtLoci,
cmode,
qmode,
refReader,
bqsrApplier,
defaultBaseQualities);
expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null &&
readProperties.getDownsamplingMethod().useExperimentalDownsampling &&
readProperties.getDownsamplingMethod().type != DownsampleType.NONE &&
readProperties.getDownsamplingMethod().toCoverage != null;
// cache the read group id (original) -> read group id (merged)
// and read group id (merged) -> read group id (original) mappings.
for(SAMReaderID id: readerIDs) {
@ -470,6 +466,16 @@ public class SAMDataSource {
}
}
/**
* Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places?
*
* @return true if we are using expanded shards, otherwise false
*/
public boolean usingExpandedShards() {
return expandShardsForDownsampling;
}
/**
* Fill the given buffering shard with reads.
* @param shard Shard to fill.
@ -486,9 +492,40 @@ public class SAMDataSource {
CloseableIterator<SAMRecord> iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate);
while(!shard.isBufferFull() && iterator.hasNext()) {
read = iterator.next();
shard.addRead(read);
noteFilePositionUpdate(positionUpdates,read);
final SAMRecord nextRead = iterator.next();
if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) {
// only add reads to the shard if they are on the same contig
read = nextRead;
shard.addRead(read);
noteFilePositionUpdate(positionUpdates,read);
} else {
break;
}
}
// If the reads are sorted in coordinate order, ensure that all reads
// having the same alignment start become part of the same shard, to allow
// downsampling to work better across shard boundaries. Note that because our
// read stream has already been fed through the positional downsampler, which
// ensures that at each alignment start position there are no more than dcov
// reads, we're in no danger of accidentally creating a disproportionately huge
// shard
if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) {
while ( iterator.hasNext() ) {
SAMRecord additionalRead = iterator.next();
// Stop filling the shard as soon as we encounter a read having a different
// alignment start or contig from the last read added in the earlier loop
// above, or an unmapped read
if ( read == null ||
additionalRead.getReadUnmappedFlag() ||
! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) ||
additionalRead.getAlignmentStart() != read.getAlignmentStart() ) {
break;
}
shard.addRead(additionalRead);
noteFilePositionUpdate(positionUpdates, additionalRead);
}
}
// If the reads are sorted in queryname order, ensure that all reads
@ -585,6 +622,7 @@ public class SAMDataSource {
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
iteratorMap.put(readers.getReader(id), iterator);
}
@ -597,10 +635,7 @@ public class SAMDataSource {
readProperties.getDownsamplingMethod().toFraction,
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
readProperties.getSupplementalFilters(),
readProperties.getBAQCalculationMode(),
readProperties.getBAQQualityMode(),
readProperties.getRefReader(),
readProperties.getBQSRApplier(),
readProperties.getReadTransformers(),
readProperties.defaultBaseQualities());
}
@ -667,40 +702,62 @@ public class SAMDataSource {
Double downsamplingFraction,
Boolean noValidationOfReadOrder,
Collection<ReadFilter> supplementalFilters,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
List<ReadTransformer> readTransformers,
byte defaultBaseQualities) {
// *********************************************************************************** //
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// *********************************************************************************** //
// ************************************************************************************************ //
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * //
// ************************************************************************************************ //
if (downsamplingFraction != null)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
}
// Use the old fractional downsampler only if we're not using experimental downsampling:
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
// verify the read ordering by applying a sort order iterator
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
// only wrap if we are replacing the original qualities or using a default base quality
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
if (bqsrApplier != null)
wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);
if (cmode != BAQ.CalculationMode.OFF)
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);
// set up read transformers
for ( final ReadTransformer readTransformer : readTransformers ) {
if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
}
return wrappedIterator;
}
/**
 * Wrap the given read iterator in the appropriate experimental downsampling iterator,
 * as selected by the configured downsampling method.
 *
 * BY_SAMPLE wraps with a per-sample downsampling iterator; ALL_READS wraps with a pooled
 * downsampling iterator. In both cases a positional downsampler is used when a target
 * coverage is set, otherwise a fractional downsampler. Any other downsampling type
 * returns the iterator unchanged.
 *
 * @param wrappedIterator the upstream read iterator to wrap; must not be null
 * @return the (possibly wrapped) iterator, never null
 */
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
    // hoist the repeated accessor call: the method is consulted for type/toCoverage/toFraction below
    final DownsamplingMethod method = readProperties.getDownsamplingMethod();

    if ( method.type == DownsampleType.BY_SAMPLE ) {
        final ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = method.toCoverage != null ?
                new SimplePositionalDownsamplerFactory<SAMRecord>(method.toCoverage) :
                new FractionalDownsamplerFactory<SAMRecord>(method.toFraction);
        return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
    }
    else if ( method.type == DownsampleType.ALL_READS ) {
        final ReadsDownsampler<SAMRecord> downsampler = method.toCoverage != null ?
                new SimplePositionalDownsampler<SAMRecord>(method.toCoverage) :
                new FractionalDownsampler<SAMRecord>(method.toFraction);
        return new DownsamplingReadsIterator(wrappedIterator, downsampler);
    }

    return wrappedIterator;
}
private class SAMResourcePool {
/**
* How many entries can be cached in this resource pool?

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.gatk;
package org.broadinstitute.sting.gatk.downsampling;
/**
* Type of downsampling method to invoke.

View File

@ -28,49 +28,92 @@ import java.util.Collection;
import java.util.List;
/**
* The basic downsampler API, with no reads-specific operations
* The basic downsampler API, with no reads-specific operations.
*
* Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle
* any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a
* PerSampleDownsamplingReadsIterator.
*
* @author David Roazen
*/
public interface Downsampler<T> {
/*
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
/**
* Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
* immediately whether the item survives the downsampling process, while others will need to see
* more items before making that determination.
*
* @param item the individual item to submit to the downsampler for consideration
*/
public void submit( T item );
/*
* Submit a collection of items to the downsampler for consideration.
/**
* Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
* submit() on each individual item in the collection.
*
* @param items the collection of items to submit to the downsampler for consideration
*/
public void submit( Collection<T> items );
/*
/**
* Are there items that have survived the downsampling process waiting to be retrieved?
*
* @return true if this downsampler has > 0 finalized items, otherwise false
*/
public boolean hasDownsampledItems();
public boolean hasFinalizedItems();
/*
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
/**
* Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
*
* @return a list of all finalized items this downsampler contains, or an empty list if there are none
*/
public List<T> consumeDownsampledItems();
public List<T> consumeFinalizedItems();
/*
/**
* Are there items stored in this downsampler that it doesn't yet know whether they will
* ultimately survive the downsampling process?
*
* @return true if this downsampler has > 0 pending items, otherwise false
*/
public boolean hasPendingItems();
/*
/**
* Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
*
* @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
* or null if there are none
*/
public T peekFinalized();
/**
* Peek at the first pending item stored in this downsampler (or null if there are no pending items)
*
* @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
* or null if there are none
*/
public T peekPending();
/**
* Returns the number of items discarded (so far) during the downsampling process
*
* @return the number of items that have been submitted to this downsampler and discarded in the process of
* downsampling
*/
public int getNumberOfDiscardedItems();
/**
* Used to tell the downsampler that no more items will be submitted to it, and that it should
* finalize any pending items.
*/
public void signalEndOfInput();
/*
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
* information.
/**
* Empty the downsampler of all finalized/pending items
*/
public void clear();
/**
* Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
*/
public void reset();
}

View File

@ -0,0 +1,153 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*/
/**
 * Describes the method for downsampling reads at a given locus.
 *
 * Immutable once constructed; the constructor validates that the requested
 * combination of type / coverage target / fraction target is legal and throws
 * a UserException.CommandLineException otherwise.
 */
public class DownsamplingMethod {
    /**
     * Type of downsampling to perform.
     */
    public final DownsampleType type;

    /**
     * Actual downsampling target is specified as an integer number of reads.
     * Null when downsampling to a fraction instead, or when type is NONE.
     */
    public final Integer toCoverage;

    /**
     * Actual downsampling target is specified as a fraction of total available reads.
     * Null when downsampling to coverage instead, or when type is NONE.
     */
    public final Double toFraction;

    /**
     * Use the new experimental downsampling?
     */
    public final boolean useExperimentalDownsampling;

    /**
     * Expresses no downsampling applied at all.
     */
    public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE, null, null, false);

    /**
     * Default type to use if no type is specified
     */
    public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;

    /**
     * Default target coverage for locus-based traversals
     */
    public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;

    /**
     * Create a downsampling method.
     *
     * @param type kind of downsampling to perform; if null, DEFAULT_DOWNSAMPLING_TYPE is used
     * @param toCoverage target read count, or null if not downsampling to coverage
     * @param toFraction target fraction of reads (0.0-1.0 inclusive), or null if not downsampling to a fraction
     * @param useExperimentalDownsampling whether to use the new experimental downsampling implementation
     * @throws UserException.CommandLineException if the combination of arguments is invalid (see validate())
     */
    public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
        this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
        this.useExperimentalDownsampling = useExperimentalDownsampling;

        // Bug fix: the coverage/fraction targets must be cleared BEFORE the final
        // fields are assigned. The previous version assigned the fields first and
        // then nulled the (by then irrelevant) constructor parameters, so a NONE
        // method silently retained any targets that were passed in.
        if ( this.type == DownsampleType.NONE ) {
            this.toCoverage = null;
            this.toFraction = null;
        }
        else {
            this.toCoverage = toCoverage;
            this.toFraction = toFraction;
        }

        validate();
    }

    /**
     * Enforce the invariants on the type/coverage/fraction combination, including
     * the additional restrictions specific to the old and new downsampling
     * implementations.
     *
     * @throws UserException.CommandLineException on any invalid combination
     */
    private void validate() {
        // Can't leave toFraction and toCoverage null unless type is NONE
        if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null )
            throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");

        // Fraction and coverage cannot both be specified.
        if ( toFraction != null && toCoverage != null )
            throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");

        // toCoverage must be > 0 when specified
        if ( toCoverage != null && toCoverage <= 0 ) {
            throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage");
        }

        // toFraction must be >= 0.0 and <= 1.0 when specified
        if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
            throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
        }

        // Some restrictions only exist for the old downsampling implementation:
        if ( ! useExperimentalDownsampling ) {
            // By sample downsampling does not work with a fraction of reads in the old downsampling implementation
            if( type == DownsampleType.BY_SAMPLE && toFraction != null )
                throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
        }

        // Some restrictions only exist for the new downsampling implementation:
        if ( useExperimentalDownsampling ) {
            if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
                throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
            }
        }
    }

    /**
     * @return a human-readable summary of these downsampling settings
     */
    public String toString() {
        StringBuilder builder = new StringBuilder("Downsampling Settings: ");

        if ( type == DownsampleType.NONE ) {
            builder.append("No downsampling");
        }
        else {
            builder.append(String.format("Method: %s ", type));

            if ( toCoverage != null ) {
                builder.append(String.format("Target Coverage: %d ", toCoverage));
            }
            else {
                builder.append(String.format("Target Fraction: %.2f ", toFraction));
            }

            if ( useExperimentalDownsampling ) {
                builder.append("Using Experimental Downsampling");
            }
        }

        return builder.toString();
    }

    /**
     * Select the default downsampling method for a walker: locus-based traversals
     * (LocusWalker, ActiveRegionWalker) get BY_SAMPLE downsampling to the default
     * coverage target; all other walkers get no downsampling.
     *
     * @param walker the walker being run
     * @param useExperimentalDownsampling whether the new experimental downsampling implementation is in use
     * @return the default DownsamplingMethod for this walker type
     */
    public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
        if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
            return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
                                          null, useExperimentalDownsampling);
        }
        else {
            return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
        }
    }
}

View File

@ -33,7 +33,8 @@ import java.util.NoSuchElementException;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
* StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style
* downsampler interface to a pull model.
*
* @author David Roazen
*/
@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
private StingSAMIterator nestedSAMIterator;
private ReadsDownsampler<SAMRecord> downsampler;
private Collection<SAMRecord> downsampledReadsCache;
private Iterator<SAMRecord> downsampledReadsCacheIterator;
private SAMRecord nextRead = null;
private Iterator<SAMRecord> downsampledReadsCacheIterator = null;
/**
* @param iter wrapped iterator from which this iterator will pull reads
* @param downsampler downsampler through which the reads will be fed
*/
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
nestedSAMIterator = iter;
this.downsampler = downsampler;
fillDownsampledReadsCache();
advanceToNextRead();
}
public boolean hasNext() {
if ( downsampledReadsCacheIterator.hasNext() ) {
return true;
}
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
return false;
}
return true;
return nextRead != null;
}
public SAMRecord next() {
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
if ( nextRead == null ) {
throw new NoSuchElementException("next() called when there are no more items");
}
return downsampledReadsCacheIterator.next();
SAMRecord toReturn = nextRead;
advanceToNextRead();
return toReturn;
}
private void advanceToNextRead() {
if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
nextRead = null;
}
else {
nextRead = downsampledReadsCacheIterator.next();
}
}
private boolean readyToReleaseReads() {
return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext();
}
private boolean fillDownsampledReadsCache() {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) {
downsampler.submit(nestedSAMIterator.next());
}
@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
downsampler.signalEndOfInput();
}
downsampledReadsCache = downsampler.consumeDownsampledItems();
// use returned collection directly rather than make a copy, for speed
downsampledReadsCache = downsampler.consumeFinalizedItems();
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
return downsampledReadsCacheIterator.hasNext();

View File

@ -33,7 +33,10 @@ import java.util.Collection;
import java.util.List;
/**
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
* Fractional Downsampler: selects a specified fraction of the reads for inclusion.
*
* Since the selection is done randomly, the actual fraction of reads retained may be slightly
* more or less than the requested fraction, depending on the total number of reads submitted.
*
* @author David Roazen
*/
@ -43,8 +46,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
private int cutoffForInclusion;
private int numDiscardedItems;
private static final int RANDOM_POOL_SIZE = 10000;
/**
* Construct a FractionalDownsampler
*
* @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
* Actual number of reads preserved may differ randomly.
*/
public FractionalDownsampler( double fraction ) {
if ( fraction < 0.0 || fraction > 1.0 ) {
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
@ -52,12 +63,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
clear();
reset();
}
public void submit( T newRead ) {
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
selectedReads.add(newRead);
}
else {
numDiscardedItems++;
}
}
public void submit( Collection<T> newReads ) {
@ -66,11 +81,12 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
}
}
public boolean hasDownsampledItems() {
public boolean hasFinalizedItems() {
return selectedReads.size() > 0;
}
public List<T> consumeDownsampledItems() {
public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = selectedReads;
clear();
return downsampledItems;
@ -80,6 +96,18 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
return false;
}
public T peekFinalized() {
return selectedReads.isEmpty() ? null : selectedReads.get(0);
}
public T peekPending() {
return null;
}
public int getNumberOfDiscardedItems() {
return numDiscardedItems;
}
public void signalEndOfInput() {
// NO-OP
}
@ -88,7 +116,15 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
selectedReads = new ArrayList<T>();
}
public void reset() {
numDiscardedItems = 0;
}
public boolean requiresCoordinateSortOrder() {
return false;
}
public void signalNoMoreReadsBefore( T read ) {
// NO-OP
}
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating FractionalDownsamplers on demand
*
* @author David Roazen
*/
/**
 * Factory for creating FractionalDownsamplers on demand
 *
 * @author David Roazen
 */
public class FractionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    // Fraction of reads that every downsampler produced by this factory will keep
    private final double fraction;

    /**
     * @param fraction fraction of reads (0.0-1.0 inclusive) that downsamplers built
     *                 by this factory should retain
     */
    public FractionalDownsamplerFactory( double fraction ) {
        this.fraction = fraction;
    }

    /**
     * @return a freshly constructed FractionalDownsampler configured with this factory's fraction
     */
    public ReadsDownsampler<T> newInstance() {
        return new FractionalDownsampler<T>(fraction);
    }
}

View File

@ -0,0 +1,212 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.*;
/**
* Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from
* the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling
* does not occur until all Lists have been submitted and signalEndOfInput() is called.
*
* The Lists should be LinkedLists for maximum efficiency during item removal, however other
* kinds of Lists are also accepted (albeit at a slight performance penalty).
*
* Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface,
* the Lists need not contain reads. However this downsampler may not be wrapped within one of the
* DownsamplingReadsIterators
*
* @param <T> the List type representing the stacks to be leveled
* @param <E> the type of the elements of each List
*
* @author David Roazen
*/
/**
 * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from
 * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling
 * does not occur until all Lists have been submitted and signalEndOfInput() is called.
 *
 * The Lists should be LinkedLists for maximum efficiency during item removal, however other
 * kinds of Lists are also accepted (albeit at a slight performance penalty).
 *
 * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface,
 * the Lists need not contain reads. However this downsampler may not be wrapped within one of the
 * DownsamplingReadsIterators
 *
 * @param <T> the List type representing the stacks to be leveled
 * @param <E> the type of the elements of each List
 *
 * @author David Roazen
 */
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {
    // Maximum combined size of all groups after leveling (never reassigned after construction)
    private final int targetSize;

    // The groups (stacks) submitted so far; emptied by clear()/consumeFinalizedItems()
    private List<T> groups;

    // True once signalEndOfInput() has leveled the groups; distinguishes pending from finalized state
    private boolean groupsAreFinalized;

    // Running count of items removed by leveling; survives clear(), reset only by reset()
    private int numDiscardedItems;

    /**
     * Construct a LevelingDownsampler
     *
     * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
     *                   this value -- if it does, items are removed from Lists evenly until the total size
     *                   is <= this value
     */
    public LevelingDownsampler( int targetSize ) {
        this.targetSize = targetSize;
        clear();
        reset();
    }

    public void submit( T item ) {
        groups.add(item);
    }

    public void submit( Collection<T> items ){
        groups.addAll(items);
    }

    public boolean hasFinalizedItems() {
        // Groups only become finalized after signalEndOfInput() has leveled them
        return groupsAreFinalized && groups.size() > 0;
    }

    public List<T> consumeFinalizedItems() {
        if ( ! hasFinalizedItems() ) {
            return new ArrayList<T>();
        }

        // pass by reference rather than make a copy, for speed
        List<T> toReturn = groups;
        clear();
        return toReturn;
    }

    public boolean hasPendingItems() {
        return ! groupsAreFinalized && groups.size() > 0;
    }

    public T peekFinalized() {
        return hasFinalizedItems() ? groups.get(0) : null;
    }

    public T peekPending() {
        return hasPendingItems() ? groups.get(0) : null;
    }

    public int getNumberOfDiscardedItems() {
        return numDiscardedItems;
    }

    public void signalEndOfInput() {
        levelGroups();
        groupsAreFinalized = true;
    }

    public void clear() {
        groups = new ArrayList<T>();
        groupsAreFinalized = false;
    }

    public void reset() {
        numDiscardedItems = 0;
    }

    /**
     * Reduce the groups' sizes round-robin until their combined size is <= targetSize,
     * while refusing to shrink any group below one item.
     */
    private void levelGroups() {
        int totalSize = 0;
        int[] groupSizes = new int[groups.size()];
        int currentGroupIndex = 0;

        for ( T group : groups ) {
            groupSizes[currentGroupIndex] = group.size();
            totalSize += groupSizes[currentGroupIndex];
            currentGroupIndex++;
        }

        if ( totalSize <= targetSize ) {
            return;   // no need to eliminate any items
        }

        // We will try to remove exactly this many items, however we will refuse to allow any
        // one group to fall below size 1, and so might end up removing fewer items than this
        int numItemsToRemove = totalSize - targetSize;

        currentGroupIndex = 0;
        int numConsecutiveUnmodifiableGroups = 0;

        // Continue until we've either removed all the items we wanted to, or we can't
        // remove any more items without violating the constraint that all groups must
        // be left with at least one item
        while ( numItemsToRemove > 0 && numConsecutiveUnmodifiableGroups < groupSizes.length ) {
            if ( groupSizes[currentGroupIndex] > 1 ) {
                groupSizes[currentGroupIndex]--;
                numItemsToRemove--;
                numConsecutiveUnmodifiableGroups = 0;
            }
            else {
                numConsecutiveUnmodifiableGroups++;
            }

            currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length;
        }

        // Now we actually go through and reduce each group to its new count as specified in groupSizes
        currentGroupIndex = 0;
        for ( T group : groups ) {
            downsampleOneGroup(group, groupSizes[currentGroupIndex]);
            currentGroupIndex++;
        }
    }

    /**
     * Randomly discard items from one group until it contains numItemsToKeep items,
     * accumulating the number discarded into numDiscardedItems.
     *
     * @param group          the group to shrink (modified in place)
     * @param numItemsToKeep number of items that should remain in the group
     */
    private void downsampleOneGroup( T group, int numItemsToKeep ) {
        if ( numItemsToKeep >= group.size() ) {
            return;
        }

        numDiscardedItems += group.size() - numItemsToKeep;

        BitSet itemsToKeep = new BitSet(group.size());
        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) {
            itemsToKeep.set(selectedIndex);
        }

        int currentIndex = 0;

        // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator
        if ( group instanceof LinkedList ) {
            // parameterized iterator (the original used a raw Iterator)
            Iterator<E> iter = group.iterator();
            while ( iter.hasNext() ) {
                iter.next();

                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }

                currentIndex++;
            }
        }
        // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather
        // than suffer O(n^2) of item shifting
        else {
            List<E> keptItems = new ArrayList<E>(numItemsToKeep);

            for ( E item : group ) {
                if ( itemsToKeep.get(currentIndex) ) {
                    keptItems.add(item);
                }
                currentIndex++;
            }
            group.clear();
            group.addAll(keptItems);
        }
    }
}

View File

@ -0,0 +1,202 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordComparator;
import net.sf.samtools.SAMRecordCoordinateComparator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import java.util.*;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
* that downsamples reads for each sample independently, and then re-assembles
* the reads back into a single merged stream.
*
* @author David Roazen
*/
/**
 * StingSAMIterator wrapper around our generic reads downsampler interface
 * that downsamples reads for each sample independently, and then re-assembles
 * the reads back into a single merged stream.
 *
 * @author David Roazen
 */
public class PerSampleDownsamplingReadsIterator implements StingSAMIterator {
    // Upstream iterator supplying reads in (assumed) coordinate-sorted order
    private StingSAMIterator nestedSAMIterator;
    // Factory used to create one downsampler per sample, lazily
    private ReadsDownsamplerFactory<SAMRecord> downsamplerFactory;
    // One downsampler per sample name (a null key is used for reads without a read group)
    private Map<String, ReadsDownsampler<SAMRecord>> perSampleDownsamplers;
    // Finalized reads from all samples, merged back into coordinate order via the comparator
    private PriorityQueue<SAMRecord> orderedDownsampledReadsCache;
    // The read next() will return, or null when the stream is exhausted
    private SAMRecord nextRead = null;
    private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator();
    // Earliest read still pending in ANY per-sample downsampler -- reads from the cache may
    // only be released while they do not compare after this read (preserves global sort order)
    private SAMRecord earliestPendingRead = null;
    // The downsampler currently holding earliestPendingRead
    private ReadsDownsampler<SAMRecord> earliestPendingDownsampler = null;

    // Initial size of our cache of finalized reads
    private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096;

    // The number of positional changes that can occur in the read stream before all downsamplers
    // should be informed of the current position (guards against samples with relatively sparse reads
    // getting stuck in a pending state):
    private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3;  // TODO: experiment with this value

    /**
     * @param iter wrapped iterator from which this iterator will pull reads
     * @param downsamplerFactory factory used to create new downsamplers as needed
     */
    public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory<SAMRecord> downsamplerFactory ) {
        nestedSAMIterator = iter;
        this.downsamplerFactory = downsamplerFactory;
        perSampleDownsamplers = new HashMap<String, ReadsDownsampler<SAMRecord>>();
        orderedDownsampledReadsCache = new PriorityQueue<SAMRecord>(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator);

        // Pre-load the first read so hasNext()/next() are ready immediately
        advanceToNextRead();
    }

    /**
     * @return true if there is at least one more downsampled read to return
     */
    public boolean hasNext() {
        return nextRead != null;
    }

    /**
     * @return the next downsampled read, in coordinate-sorted order
     * @throws NoSuchElementException if the stream is exhausted
     */
    public SAMRecord next() {
        if ( nextRead == null ) {
            throw new NoSuchElementException("next() called when there are no more items");
        }

        SAMRecord toReturn = nextRead;
        advanceToNextRead();

        return toReturn;
    }

    // Set nextRead to the next releasable read, refilling the cache from the
    // downsamplers if necessary, or to null when no reads remain.
    private void advanceToNextRead() {
        if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
            nextRead = null;
        }
        else {
            nextRead = orderedDownsampledReadsCache.poll();
        }
    }

    // A cached read may only be released if doing so cannot violate global coordinate
    // order -- i.e., no downsampler is still holding a pending read that sorts earlier.
    private boolean readyToReleaseReads() {
        if ( orderedDownsampledReadsCache.isEmpty() ) {
            return false;
        }

        return earliestPendingRead == null ||
               readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0;
    }

    // Keep earliestPendingRead/earliestPendingDownsampler current after currentDownsampler's
    // pending state may have changed.
    private void updateEarliestPendingRead( ReadsDownsampler<SAMRecord> currentDownsampler ) {
        // If there is no recorded earliest pending read and this downsampler has pending items,
        // then this downsampler's first pending item becomes the new earliest pending read:
        if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) {
            earliestPendingRead = currentDownsampler.peekPending();
            earliestPendingDownsampler = currentDownsampler;
        }
        // In all other cases, we only need to update the earliest pending read when the downsampler
        // associated with it experiences a change in its pending reads, since by assuming a sorted
        // read stream we're assured that each downsampler's earliest pending read will only increase
        // in genomic position over time.
        //
        // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers
        // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))),
        // TODO: but need to verify this empirically.
        else if ( currentDownsampler == earliestPendingDownsampler &&
                  (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) {

            // Linear rescan of all downsamplers to find the new minimum pending read
            earliestPendingRead = null;
            earliestPendingDownsampler = null;

            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                if ( perSampleDownsampler.hasPendingItems() &&
                     (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) {

                    earliestPendingRead = perSampleDownsampler.peekPending();
                    earliestPendingDownsampler = perSampleDownsampler;
                }
            }
        }
    }

    // Feed upstream reads through the per-sample downsamplers until at least one cached
    // read can be safely released (or input runs out). Returns whether a read is releasable.
    private boolean fillDownsampledReadsCache() {
        SAMRecord prevRead = null;
        int numPositionalChanges = 0;

        // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue
        // can be released without violating global sort order
        while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) {
            SAMRecord read = nestedSAMIterator.next();
            // Reads with no read group are pooled under a null sample key
            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;

            ReadsDownsampler<SAMRecord> thisSampleDownsampler = perSampleDownsamplers.get(sampleName);
            if ( thisSampleDownsampler == null ) {
                thisSampleDownsampler = downsamplerFactory.newInstance();
                perSampleDownsamplers.put(sampleName, thisSampleDownsampler);
            }

            thisSampleDownsampler.submit(read);
            updateEarliestPendingRead(thisSampleDownsampler);

            // NOTE(review): only alignment start is compared here; a contig change with the same
            // start coordinate would not count as a positional change -- confirm this is intended
            if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) {
                numPositionalChanges++;
            }

            // If the number of times we've changed position exceeds a certain threshold, inform all
            // downsamplers of the current position in the read stream. This is to prevent downsamplers
            // for samples with sparser reads than others from getting stuck too long in a pending state.
            if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) {
                for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                    perSampleDownsampler.signalNoMoreReadsBefore(read);
                    updateEarliestPendingRead(perSampleDownsampler);
                }
            }

            prevRead = read;
        }

        if ( ! nestedSAMIterator.hasNext() ) {
            // Input exhausted: force every downsampler to finalize its pending reads
            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                perSampleDownsampler.signalEndOfInput();
            }
            earliestPendingRead = null;
            earliestPendingDownsampler = null;
        }

        // Sweep all finalized reads into the coordinate-ordered cache
        for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
            if ( perSampleDownsampler.hasFinalizedItems() ) {
                orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems());
            }
        }

        return readyToReleaseReads();
    }

    /**
     * Unsupported: reads cannot be removed through this iterator.
     */
    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
    }

    /**
     * Close the wrapped iterator.
     */
    public void close() {
        nestedSAMIterator.close();
    }

    public Iterator<SAMRecord> iterator() {
        return this;
    }
}

View File

@ -1,259 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
*
* @author David Roazen
*/
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // Maximum number of reads to retain at any single alignment start position, and the
    // target for total coverage across overlapping position groups
    private int targetCoverage;

    // Randomly downsamples the stack of reads sharing the current alignment start down to
    // targetCoverage before they are moved into a pending position group
    private ReservoirDownsampler<T> reservoir;

    // Contig index of the most recently submitted read
    private int currentContigIndex;

    // Alignment start of the most recently submitted read
    private int currentAlignmentStart;

    // One group per already-seen alignment start whose reads may still overlap future
    // positions, and so are not yet safe to emit; kept in coordinate order
    private LinkedList<PositionalReadGrouping> pendingReads;

    // Reads that can no longer be affected by downsampling, ready for the caller to consume
    private ArrayList<T> finalizedReads;

    /**
     * Construct a PositionalDownsampler
     *
     * @param targetCoverage maximum number of reads to retain at any given alignment start
     *                       position (and the target for overlapping-group coverage)
     */
    public PositionalDownsampler ( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        clear();
        // NOTE(review): clear() does not initialize currentContigIndex/currentAlignmentStart,
        // so both begin at the Java default of 0. The first submitted read will typically
        // look "past" position (0, 0) and trigger updateAndDownsamplePendingReads() on an
        // empty reservoir; the resulting empty group appears to be finalized and discarded
        // harmlessly, but this should be confirmed.
    }

    /**
     * Submit one read for downsampling. Reads must be submitted in coordinate order
     * (see requiresCoordinateSortOrder()).
     *
     * @param newRead the read to submit
     */
    public void submit ( T newRead ) {
        // When we move past the previous position, flush the reservoir's survivors into a
        // new pending group and rebalance coverage across all overlapping groups
        if ( readIsPastCurrentPosition(newRead) ) {
            updateAndDownsamplePendingReads();
        }
        reservoir.submit(newRead);
        updateCurrentPosition(newRead);
    }

    /**
     * Submit a collection of reads, one at a time, in iteration order.
     *
     * @param newReads reads to submit; must collectively be in coordinate order
     */
    public void submit ( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    /**
     * @return true if there are reads ready to be consumed via consumeDownsampledItems()
     */
    public boolean hasDownsampledItems() {
        return finalizedReads.size() > 0;
    }

    /**
     * Hand the finalized reads to the caller and start a fresh finalized list.
     *
     * @return the reads finalized so far, in coordinate order
     */
    public List<T> consumeDownsampledItems() {
        // Pass the internal list by reference (rather than copying) for speed; a fresh
        // list replaces it so the caller owns the returned one exclusively
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    /**
     * @return true if reads are still held back pending possible future downsampling
     */
    public boolean hasPendingItems() {
        return pendingReads.size() > 0;
    }

    /**
     * Signal that no more reads will arrive: perform a final downsampling pass, then
     * finalize and emit everything still pending.
     */
    public void signalEndOfInput() {
        updateAndDownsamplePendingReads();
        for ( PositionalReadGrouping group : pendingReads ) {
            group.finalizeAllActiveReads();
            finalizedReads.addAll(group.getFinalizedReads());
        }
        pendingReads.clear();
    }

    /**
     * Reset the reservoir and the pending/finalized read collections.
     *
     * NOTE(review): does not reset currentContigIndex/currentAlignmentStart -- confirm
     * whether reuse of a cleared downsampler on a new read stream is intended.
     */
    public void clear() {
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        pendingReads = new LinkedList<PositionalReadGrouping>();
        finalizedReads = new ArrayList<T>();
    }

    /**
     * @return true: this downsampler's position tracking requires coordinate-sorted input
     */
    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    // Record the contig/start of the most recently submitted read
    private void updateCurrentPosition ( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
    }

    // A read is "past" the current position if it's on a different contig or starts later.
    // NOTE(review): presumably unmapped reads (reference index -1 in SAMRecord) would always
    // compare unequal to a mapped contig index here -- behavior for unmapped reads at the
    // end of a coordinate-sorted stream should be confirmed.
    private boolean readIsPastCurrentPosition ( T read ) {
        return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
    }

    // Emit groups that can no longer overlap the current position, move the reservoir's
    // survivors for the position just left into a new pending group, then rebalance
    // coverage across the groups that still overlap
    private void updateAndDownsamplePendingReads() {
        finalizeOutOfScopeReads();
        List<T> oldLocusReads = reservoir.consumeDownsampledItems();
        pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));
        downsampleOverlappingGroups();
    }

    // Within each pending group, finalize reads that end before the current position.
    // A fully-finalized group is removed (and its reads emitted) only if every group
    // before it has also been emitted, preserving coordinate order in finalizedReads.
    private void finalizeOutOfScopeReads() {
        Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
        boolean noPrecedingUnfinalizedGroups = true;
        while ( iter.hasNext() ) {
            PositionalReadGrouping currentGroup = iter.next();
            currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);
            if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
                iter.remove();
                finalizedReads.addAll(currentGroup.getFinalizedReads());
            }
            else {
                noPrecedingUnfinalizedGroups = false;
            }
        }
    }

    // If total active coverage across all pending (overlapping) groups exceeds the target,
    // remove reads round-robin across the groups, never reducing an active group below one
    // read. numReadsToRemove is capped at totalCoverage - numActiveGroups -- exactly the
    // number of removable reads when each active group keeps at least one -- so the
    // round-robin loop always terminates.
    private void downsampleOverlappingGroups() {
        int[] groupReadCounts = new int[pendingReads.size()];
        int totalCoverage = 0;
        int numActiveGroups = 0;
        int currentGroup = 0;

        // Tally per-group active read counts and total coverage
        for ( PositionalReadGrouping group : pendingReads ) {
            groupReadCounts[currentGroup] = group.numActiveReads();
            totalCoverage += groupReadCounts[currentGroup];
            if ( groupReadCounts[currentGroup] > 0 ) {
                numActiveGroups++;
            }
            currentGroup++;
        }

        // Already at or below target: nothing to remove
        if ( totalCoverage <= targetCoverage ) {
            return;
        }

        int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
        currentGroup = 0;

        // Decide how many reads each group should keep, cycling through the groups and
        // decrementing only groups that still have more than one read
        while ( numReadsToRemove > 0 ) {
            if ( groupReadCounts[currentGroup] > 1 ) {
                groupReadCounts[currentGroup]--;
                numReadsToRemove--;
            }
            currentGroup = (currentGroup + 1) % groupReadCounts.length;
        }

        // Apply the per-group quotas; finalized groups (0 active reads) are skipped
        currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            if ( ! group.isFinalized() ) {
                group.downsampleActiveReads(groupReadCounts[currentGroup]);
            }
            currentGroup++;
        }
    }

    /**
     * A group of reads sharing one alignment start position. Reads are "active" while they
     * may still be removed by downsampling, and "finalized" once they end before the
     * downsampler's current position (or input ends).
     */
    private class PositionalReadGrouping {

        // Reads still subject to downsampling
        private List<T> activeReads;

        // Reads in this group that are safe to emit
        private List<T> finalizedReads;

        // Contig index shared by all reads in this group
        private int contig;

        // Alignment start shared by all reads in this group
        private int alignmentStart;

        /**
         * Construct a group for the given reads at the given position.
         *
         * @param reads          reads sharing this alignment start
         * @param contig         contig index of the group
         * @param alignmentStart alignment start of the group
         */
        public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
            activeReads = new LinkedList<T>(reads);
            finalizedReads = new ArrayList<T>();
            this.contig = contig;
            this.alignmentStart = alignmentStart;
        }

        /**
         * @return number of reads still subject to downsampling
         */
        public int numActiveReads() {
            return activeReads.size();
        }

        /**
         * @return true if no reads remain subject to downsampling
         */
        public boolean isFinalized() {
            return activeReads.size() == 0;
        }

        /**
         * @return the reads in this group that are safe to emit (not a copy)
         */
        public List<T> getFinalizedReads() {
            return finalizedReads;
        }

        /**
         * Move active reads that cannot overlap the given position into the finalized list.
         * A contig change finalizes every active read.
         *
         * @param contig   contig index of the position being compared against
         * @param position alignment start of the position being compared against
         */
        public void finalizeActiveReadsBeforePosition( int contig, int position ) {
            if ( this.contig != contig ) {
                finalizeAllActiveReads();
                return;
            }

            Iterator<T> iter = activeReads.iterator();
            while ( iter.hasNext() ) {
                T read = iter.next();
                // Reads ending strictly before the position can no longer overlap it
                if ( read.getAlignmentEnd() < position ) {
                    iter.remove();
                    finalizedReads.add(read);
                }
            }
        }

        /**
         * Unconditionally finalize every remaining active read.
         */
        public void finalizeAllActiveReads() {
            finalizedReads.addAll(activeReads);
            activeReads.clear();
        }

        /**
         * Randomly retain exactly numReadsToKeep of the active reads, discarding the rest.
         *
         * @param numReadsToKeep number of active reads to retain; must be in
         *                       [0, numActiveReads()]
         */
        public void downsampleActiveReads( int numReadsToKeep ) {
            if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
                throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
                        numReadsToKeep, activeReads.size()));
            }

            // Randomly choose which positions in the active list survive
            BitSet itemsToKeep = new BitSet(activeReads.size());
            for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
                itemsToKeep.set(selectedIndex);
            }

            // Remove every read whose position was not selected
            int currentIndex = 0;
            Iterator<T> iter = activeReads.iterator();
            while ( iter.hasNext() ) {
                T read = iter.next();
                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }
                currentIndex++;
            }
        }
    }
}

View File

@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord;
*/
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
/*
/**
* Does this downsampler require that reads be fed to it in coordinate order?
*
* @return true if reads must be submitted to this downsampler in coordinate order, otherwise false
*/
public boolean requiresCoordinateSortOrder();
/**
* Tell this downsampler that no more reads located before the provided read (according to
* the sort order of the read stream) will be fed to it.
*
* Allows position-aware downsamplers to finalize pending reads earlier than they would
* otherwise be able to, particularly when doing per-sample downsampling and reads for
* certain samples are sparser than average.
*
* @param read the downsampler will assume that no reads located before this read will ever
* be submitted to it in the future
*/
public void signalNoMoreReadsBefore( T read );
}

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular
* downsampler, all sharing the same construction parameters.
*
* @author David Roazen
*/
public interface ReadsDownsamplerFactory<T extends SAMRecord> {

    /**
     * Create a new downsampler instance sharing this factory's construction parameters.
     *
     * @return a freshly constructed ReadsDownsampler
     */
    public ReadsDownsampler<T> newInstance();
}

View File

@ -48,6 +48,14 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
private int totalReadsSeen;
private int numDiscardedItems;
/**
* Construct a ReservoirDownsampler
*
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
* after downsampling will be min(totalReads, targetSampleSize)
*/
public ReservoirDownsampler ( int targetSampleSize ) {
if ( targetSampleSize <= 0 ) {
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
@ -55,6 +63,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
this.targetSampleSize = targetSampleSize;
clear();
reset();
}
public void submit ( T newRead ) {
@ -68,6 +77,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
if ( randomSlot < targetSampleSize ) {
reservoir.set(randomSlot, newRead);
}
numDiscardedItems++;
}
}
@ -77,11 +87,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
}
}
public boolean hasDownsampledItems() {
public boolean hasFinalizedItems() {
return reservoir.size() > 0;
}
public List<T> consumeDownsampledItems() {
public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
@ -91,16 +102,36 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
return false;
}
public T peekFinalized() {
return reservoir.isEmpty() ? null : reservoir.get(0);
}
public T peekPending() {
return null;
}
public int getNumberOfDiscardedItems() {
return numDiscardedItems;
}
public void signalEndOfInput() {
// NO-OP
}
public void clear() {
reservoir = new ArrayList<T>(targetSampleSize);
totalReadsSeen = 0;
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below
}
public void reset() {
numDiscardedItems = 0;
}
public boolean requiresCoordinateSortOrder() {
return false;
}
public void signalNoMoreReadsBefore( T read ) {
// NO-OP
}
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating ReservoirDownsamplers on demand
*
* @author David Roazen
*/
public class ReservoirDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    // Reservoir size shared by every downsampler this factory creates.
    // Declared final: it is set once at construction and never changes.
    private final int targetSampleSize;

    /**
     * Construct a ReservoirDownsamplerFactory
     *
     * @param targetSampleSize reservoir size to use for every ReservoirDownsampler created
     *                         by this factory
     */
    public ReservoirDownsamplerFactory( int targetSampleSize ) {
        this.targetSampleSize = targetSampleSize;
    }

    /**
     * Create a new ReservoirDownsampler with this factory's target sample size.
     *
     * @return a freshly constructed ReservoirDownsampler
     */
    public ReadsDownsampler<T> newInstance() {
        return new ReservoirDownsampler<T>(targetSampleSize);
    }
}

View File

@ -0,0 +1,169 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import java.util.*;
/**
* Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage
* using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time.
*
* @author David Roazen
*/
public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // Maximum number of reads that may share any given alignment start position
    private int targetCoverage;

    // Randomly downsamples the stack of reads at the current alignment start; its
    // "finalized" items are this downsampler's pending items until the position advances
    private ReservoirDownsampler<T> reservoir;

    // Contig index of the current position (valid only when positionEstablished is true)
    private int currentContigIndex;

    // Alignment start of the current position (valid only when positionEstablished is true)
    private int currentAlignmentStart;

    // False until the first read establishes a position, so the first read is never
    // mistakenly treated as being at a previously-seen position
    private boolean positionEstablished;

    // Once true, all subsequent reads are assumed unmapped and are passed through
    // without downsampling
    private boolean unmappedReadsReached;

    // Reads that are past the downsampling stage and ready for the caller
    private ArrayList<T> finalizedReads;

    // Number of reads discarded since the last reset()
    private int numDiscardedItems;

    /**
     * Construct a SimplePositionalDownsampler
     *
     * @param targetCoverage Maximum number of reads that may share any given alignment start position
     */
    public SimplePositionalDownsampler( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        // reservoir must exist before clear(), which delegates to reservoir.clear()/reset()
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        finalizedReads = new ArrayList<T>();
        clear();
        reset();
    }

    /**
     * Submit one read for downsampling. Reads must be submitted in coordinate order
     * (see requiresCoordinateSortOrder()).
     *
     * @param newRead the read to submit
     */
    public void submit( T newRead ) {
        updatePositionalState(newRead);

        if ( unmappedReadsReached ) {  // don't downsample the unmapped reads at the end of the stream
            finalizedReads.add(newRead);
        }
        else {
            // Track how many reads the reservoir discards on this submission by diffing
            // its cumulative discard counter before and after
            int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems();
            reservoir.submit(newRead);
            numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems;
        }
    }

    /**
     * Submit a collection of reads, one at a time, in iteration order.
     *
     * @param newReads reads to submit; must collectively be in coordinate order
     */
    public void submit( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    /**
     * @return true if there are reads ready to be consumed via consumeFinalizedItems()
     */
    public boolean hasFinalizedItems() {
        return finalizedReads.size() > 0;
    }

    /**
     * Hand the finalized reads to the caller and start a fresh finalized list.
     *
     * @return the reads finalized so far, in coordinate order
     */
    public List<T> consumeFinalizedItems() {
        // pass by reference rather than make a copy, for speed
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    /**
     * @return true if reads for the current position are still held in the reservoir
     */
    public boolean hasPendingItems() {
        return reservoir.hasFinalizedItems();
    }

    /**
     * @return the first finalized read, or null if there are none
     */
    public T peekFinalized() {
        return finalizedReads.isEmpty() ? null : finalizedReads.get(0);
    }

    /**
     * @return the first read pending in the reservoir, or null if there are none
     */
    public T peekPending() {
        // the reservoir's "finalized" items are this downsampler's pending items
        return reservoir.peekFinalized();
    }

    /**
     * @return number of reads discarded by downsampling since the last reset()
     */
    public int getNumberOfDiscardedItems() {
        return numDiscardedItems;
    }

    /**
     * Signal that no more reads will arrive: finalize whatever the reservoir is holding.
     */
    public void signalEndOfInput() {
        finalizeReservoir();
    }

    /**
     * Discard all reads and positional state. Does not reset the discarded-item count
     * (see reset()).
     */
    public void clear() {
        reservoir.clear();
        reservoir.reset();
        finalizedReads.clear();
        positionEstablished = false;
        unmappedReadsReached = false;
    }

    /**
     * Reset the discarded-item statistic only.
     */
    public void reset() {
        numDiscardedItems = 0;
    }

    /**
     * @return true: this downsampler's position tracking requires coordinate-sorted input
     */
    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    /**
     * Use the given read as evidence that no earlier-positioned reads will arrive,
     * allowing the current reservoir to be finalized early.
     *
     * @param read no reads located before this read will ever be submitted
     */
    public void signalNoMoreReadsBefore( T read ) {
        updatePositionalState(read);
    }

    // When a read moves past the current position, flush the reservoir (if non-empty)
    // into the finalized list and adopt the read's position as current. Reaching an
    // unmapped read switches off downsampling for the rest of the stream.
    private void updatePositionalState( T newRead ) {
        if ( readIsPastCurrentPosition(newRead) ) {
            if ( reservoir.hasFinalizedItems() ) {
                finalizeReservoir();
            }

            setCurrentPosition(newRead);

            if ( newRead.getReadUnmappedFlag() ) {
                unmappedReadsReached = true;
            }
        }
    }

    // Record the contig/start of the given read as the current position
    private void setCurrentPosition( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
        positionEstablished = true;
    }

    // A read is "past" the current position if no position is established yet, it's on a
    // later contig, it starts later on the same contig, or it's the first unmapped read.
    // NOTE(review): assumes coordinate-sorted input, where contig indices only increase --
    // an out-of-order (smaller) contig index would not register as a position change.
    private boolean readIsPastCurrentPosition( T read ) {
        return ! positionEstablished ||
               read.getReferenceIndex() > currentContigIndex ||
               read.getAlignmentStart() > currentAlignmentStart ||
               (read.getReadUnmappedFlag() && ! unmappedReadsReached);
    }

    // Move the reservoir's surviving reads into the finalized list; reset only the
    // reservoir's discard statistic (its cumulative count was already folded into ours)
    private void finalizeReservoir() {
        finalizedReads.addAll(reservoir.consumeFinalizedItems());
        reservoir.reset();
    }
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* Factory for creating SimplePositionalDownsamplers on demand
*
* @author David Roazen
*/
public class SimplePositionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {

    // Target coverage shared by every downsampler this factory creates.
    // Declared final: it is set once at construction and never changes.
    private final int targetCoverage;

    /**
     * Construct a SimplePositionalDownsamplerFactory
     *
     * @param targetCoverage maximum per-position coverage to use for every
     *                       SimplePositionalDownsampler created by this factory
     */
    public SimplePositionalDownsamplerFactory( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
    }

    /**
     * Create a new SimplePositionalDownsampler with this factory's target coverage.
     *
     * @return a freshly constructed SimplePositionalDownsampler
     */
    public ReadsDownsampler<T> newInstance() {
        return new SimplePositionalDownsampler<T>(targetCoverage);
    }
}

View File

@ -8,9 +8,11 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.threading.EfficiencyMonitoringThreadFactory;
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
import java.util.Collection;
@ -75,14 +77,27 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
/**
* Create a new hierarchical microscheduler to process the given reads and reference.
*
* @param walker the walker used to process the dataset.
* @param reads Reads file(s) to process.
* @param reference Reference for driving the traversal.
* @param nThreadsToUse maximum number of threads to use to do the work
* @param walker the walker used to process the dataset.
* @param reads Reads file(s) to process.
* @param reference Reference for driving the traversal.
* @param threadAllocation How should we apply multi-threaded execution?
*/
protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse ) {
super(engine, walker, reads, reference, rods);
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final ThreadAllocation threadAllocation) {
super(engine, walker, reads, reference, rods, threadAllocation);
final int nThreadsToUse = threadAllocation.getNumDataThreads();
if ( threadAllocation.monitorThreadEfficiency() ) {
final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse);
setThreadEfficiencyMonitor(monitoringThreadFactory);
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory);
} else {
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
}
}
public Object execute( Walker walker, Iterable<Shard> shardStrategy ) {
@ -140,6 +155,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
// do final cleanup operations
outputTracker.close();
cleanup();
executionIsDone();
return result;
}

View File

@ -10,9 +10,11 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
import java.util.Collection;
@ -33,8 +35,16 @@ public class LinearMicroScheduler extends MicroScheduler {
* @param reference Reference for driving the traversal.
* @param rods Reference-ordered data.
*/
protected LinearMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods ) {
super(engine, walker, reads, reference, rods);
protected LinearMicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final ThreadAllocation threadAllocation) {
super(engine, walker, reads, reference, rods, threadAllocation);
if ( threadAllocation.monitorThreadEfficiency() )
setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor());
}
/**
@ -49,11 +59,12 @@ public class LinearMicroScheduler extends MicroScheduler {
boolean done = walker.isDone();
int counter = 0;
traversalEngine.startTimersIfNecessary();
for (Shard shard : shardStrategy ) {
if ( done || shard == null ) // we ran out of shards that aren't owned
break;
traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
@ -88,6 +99,7 @@ public class LinearMicroScheduler extends MicroScheduler {
outputTracker.close();
cleanup();
executionIsDone();
return accumulator;
}

View File

@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
import javax.management.JMException;
import javax.management.MBeanServer;
@ -58,6 +59,8 @@ import java.util.Collection;
/** Shards and schedules data in manageable chunks. */
public abstract class MicroScheduler implements MicroSchedulerMBean {
// TODO -- remove me and retire non nano scheduled versions of traversals
private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true;
protected static final Logger logger = Logger.getLogger(MicroScheduler.class);
/**
@ -79,6 +82,13 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
private final MBeanServer mBeanServer;
private final ObjectName mBeanName;
/**
* Threading efficiency monitor for tracking the resource utilization of the GATK
*
* may be null
*/
ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
/**
* MicroScheduler factory function. Create a microscheduler appropriate for reducing the
* selected walker.
@ -92,18 +102,36 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* @return The best-fit microscheduler.
*/
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
if(walker.isReduceByInterval())
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
if(walker instanceof ReadWalker)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
} else {
if(threadAllocation.getNumCPUThreads() > 1)
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
if ( threadAllocation.isRunningInParallelMode() ) {
// TODO -- remove me when we fix running NCT within HMS
if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1)
throw new UserException("Currently the GATK does not support running CPU threads within data threads, " +
"please specify only one of NT and NCT");
logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)",
threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads()));
}
if ( threadAllocation.getNumDataThreads() > 1 ) {
if (walker.isReduceByInterval())
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
if ( ! (walker instanceof TreeReducible) ) {
throw badNT("nt", engine, walker);
} else {
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
}
} else {
if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) )
throw badNT("nct", engine, walker);
return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
}
}
private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) {
throw new UserException.BadArgumentValue("nt",
String.format("The analysis %s currently does not support parallel execution with %s. " +
"Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg));
}
/**
@ -113,17 +141,27 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
* @param reads The reads.
* @param reference The reference.
* @param rods the rods to include in the traversal
* @param threadAllocation the allocation of threads to use in the underlying traversal
*/
protected MicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
protected MicroScheduler(final GenomeAnalysisEngine engine,
final Walker walker,
final SAMDataSource reads,
final IndexedFastaSequenceFile reference,
final Collection<ReferenceOrderedDataSource> rods,
final ThreadAllocation threadAllocation) {
this.engine = engine;
this.reads = reads;
this.reference = reference;
this.rods = rods;
if (walker instanceof ReadWalker) {
traversalEngine = new TraverseReads();
traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1
? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread())
: new TraverseReads();
} else if (walker instanceof LocusWalker) {
traversalEngine = new TraverseLoci();
traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1
? new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread())
: new TraverseLociLinear();
} else if (walker instanceof DuplicateWalker) {
traversalEngine = new TraverseDuplicates();
} else if (walker instanceof ReadPairWalker) {
@ -150,6 +188,24 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
}
}
/**
* Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one
*
* @return the monitor, or null if none is active
*/
public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
return threadEfficiencyMonitor;
}
/**
* Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses
*
* @param threadEfficiencyMonitor
*/
public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) {
this.threadEfficiencyMonitor = threadEfficiencyMonitor;
}
/**
* Walks a walker over the given list of intervals.
*
@ -183,6 +239,18 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
traversalEngine.printOnTraversalDone();
}
/**
* Must be called by subclasses when execute is done
*/
protected void executionIsDone() {
// Print out the threading efficiency of this HMS, if state monitoring is enabled
if ( threadEfficiencyMonitor != null ) {
// include the master thread information
threadEfficiencyMonitor.threadIsDone(Thread.currentThread());
threadEfficiencyMonitor.printUsageInformation(logger);
}
}
/**
* Gets the engine that created this microscheduler.
* @return The engine owning this microscheduler.

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -81,7 +82,13 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals, Collection<String> sampleNames) {
this.sourceInfo = shard.getReadProperties();
this.readIterator = iterator;
this.sourceIterator = new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
// Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested:
this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ?
new PeekableIterator<AlignmentContext>(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames))
:
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;
}

View File

@ -25,9 +25,14 @@
package org.broadinstitute.sting.gatk.filters;
import com.google.common.base.Function;
import com.google.common.collect.Collections2;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.help.GATKDocUtils;
import java.util.Collection;
import java.util.List;
/**
* Manage filters and filter options. Any requests for basic filtering classes
@ -54,4 +59,39 @@ public class FilterManager extends PluginManager<ReadFilter> {
public Collection<Class<? extends ReadFilter>> getValues() {
return this.getPlugins();
}
/**
* Rather than use the default error message, print out a list of read filters as well.
* @param pluginCategory - string, the category of the plugin (e.g. read filter)
* @param pluginName - string, what we were trying to match (but failed to)
* @return - A wall of text with the default message, followed by a listing of available read filters
*/
@Override
protected String formatErrorMessage(String pluginCategory, String pluginName) {
List<Class<? extends ReadFilter>> availableFilters = this.getPluginsImplementing(ReadFilter.class);
return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName,
userFriendlyListofReadFilters(availableFilters),
"Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information.");
}
/**
 * Formats the given read filter classes as a two-column table: filter name padded to
 * the width of the longest name, followed by its GATK documentation link.
 *
 * @param filters the read filter classes to list; may be empty
 * @return the formatted table, beginning with a header row
 */
private String userFriendlyListofReadFilters(List<Class<? extends ReadFilter>> filters) {
    final String headName = "FilterName", headDoc = "Documentation";

    // Seed the column width with the header's own length: previously this started at -1,
    // which produced a nonsensical "%-1s" format when the filter list was empty and
    // misaligned the header whenever "FilterName" was longer than every filter name.
    int longestNameLength = headName.length();
    for ( final Class<? extends ReadFilter> filter : filters ) {
        longestNameLength = Math.max(longestNameLength, this.getName(filter).length());
    }

    final String format = " %"+longestNameLength+"s %s%n";
    final StringBuilder listBuilder = new StringBuilder();
    listBuilder.append(String.format(format, headName, headDoc));
    for ( final Class<? extends ReadFilter> filter : filters ) {
        final String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter);
        final String filterName = this.getName(filter);
        listBuilder.append(String.format(format, filterName, helpLink));
    }
    return listBuilder.toString();
}
}

View File

@ -31,12 +31,16 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.File;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
/**
* A stub for routing and management of SAM file reading and writing.
@ -116,15 +120,15 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
*/
private boolean simplifyBAM = false;
private List<ReadTransformer> onOutputReadTransformers = null;
/**
* Create a new stub given the requested SAM file and compression level.
* @param engine source of header data, maybe other data about input files.
* @param samFile SAM file to (ultimately) create.
*/
public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) {
this.engine = engine;
this.samFile = samFile;
this.samOutputStream = null;
this(engine, samFile, null);
}
/**
@ -133,8 +137,12 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
* @param stream Output stream to which data should be written.
*/
public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) {
this(engine, null, stream);
}
private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) {
this.engine = engine;
this.samFile = null;
this.samFile = samFile;
this.samOutputStream = stream;
}
@ -274,17 +282,29 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
this.headerOverride = header;
}
/**
 * Lazily builds the list of read transformers registered with the engine that should
 * run at ON_OUTPUT time; invoked once from addAlignment() before the first write.
 */
private void initializeReadTransformers() {
    // presized to the engine's full transformer list; only ON_OUTPUT transformers are kept
    this.onOutputReadTransformers = new ArrayList<ReadTransformer>(engine.getReadTransformers().size());
    for ( final ReadTransformer transformer : engine.getReadTransformers() ) {
        if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT )
            onOutputReadTransformers.add(transformer);
    }
}
/**
* @{inheritDoc}
*/
public void addAlignment( SAMRecord alignment ) {
if ( engine.getArguments().BAQMode != BAQ.CalculationMode.OFF && engine.getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_OUTPUT ) {
//System.out.printf("Writing BAQ at OUTPUT TIME%n");
baqHMM.baqRead(alignment, engine.getReferenceDataSource().getReference(), engine.getArguments().BAQMode, engine.getWalkerBAQQualityMode());
}
public void addAlignment( final SAMRecord readIn ) {
if ( onOutputReadTransformers == null )
initializeReadTransformers();
GATKSAMRecord workingRead = (GATKSAMRecord)readIn;
// run on output read transformers
for ( final ReadTransformer transform : onOutputReadTransformers )
workingRead = transform.apply(workingRead);
writeStarted = true;
outputTracker.getStorage(this).addAlignment(alignment);
outputTracker.getStorage(this).addAlignment(workingRead);
}
/**

View File

@ -32,9 +32,9 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
@ -269,7 +269,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
* @return
*/
public boolean alsoWriteBCFForTest() {
return engine.getArguments().numberOfThreads == 1 && // only works single threaded
return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded
! isCompressed() && // for non-compressed outputs
getFile() != null && // that are going to disk
engine.getArguments().generateShadowBCF; // and we actually want to do it

View File

@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import java.util.Iterator;
public class DownsampleIterator implements StingSAMIterator {
public class LegacyDownsampleIterator implements StingSAMIterator {
StingSAMIterator it;
int cutoff;
SAMRecord next;
public DownsampleIterator(StingSAMIterator it, double fraction) {
public LegacyDownsampleIterator(StingSAMIterator it, double fraction) {
this.it = it;
cutoff = (int)(fraction * 10000);
next = getNextRecord();

View File

@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -318,6 +318,7 @@ public class LocusIteratorByState extends LocusIterator {
continue;
if (op == CigarOperator.D) {
// TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;

View File

@ -0,0 +1,649 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.iterators;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
*/
public class LocusIteratorByStateExperimental extends LocusIterator {
/**
* our log, which we want to capture anything from this class
*/
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
// -----------------------------------------------------------------------------------------------------------------
//
// member fields
//
// -----------------------------------------------------------------------------------------------------------------
/**
* Used to create new GenomeLocs.
*/
private final GenomeLocParser genomeLocParser;
private final ArrayList<String> samples;
private final ReadStateManager readStates;
/**
 * Tracks the traversal state of a single read along the reference: the current
 * offset into the read's bases, the corresponding offset from the alignment start
 * on the genome, and the position within the read's CIGAR string.
 */
protected static class SAMRecordState {
    SAMRecord read;
    int readOffset = -1;     // how far are we offset from the start of the read bases?
    int genomeOffset = -1;   // how far are we offset from the alignment start on the genome?

    Cigar cigar = null;
    int cigarOffset = -1;
    CigarElement curElement = null;
    int nCigarElements = 0;

    int cigarElementCounter = -1;   // how far are we into a single cigarElement

    // The logical model for generating extended events is as follows: the "record state" implements the traversal
    // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
    // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
    // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
    // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
    // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
    // events immediately preceding the current reference base).

    public SAMRecordState(SAMRecord read) {
        this.read = read;
        cigar = read.getCigar();
        nCigarElements = cigar.numCigarElements();
        //System.out.printf("Creating a SAMRecordState: %s%n", this);
    }

    /** @return the read this state is tracking */
    public SAMRecord getRead() {
        return read;
    }

    /**
     * What is our current offset in the read's bases that aligns us with the reference genome?
     *
     * @return the 0-based offset into the read's bases, or -1 before the first step
     */
    public int getReadOffset() {
        return readOffset;
    }

    /**
     * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
     *
     * @return the 0-based offset from the alignment start on the genome, or -1 before the first step
     */
    public int getGenomeOffset() {
        return genomeOffset;
    }

    /** @return the reference coordinate this state currently points at (alignment start + genome offset) */
    public int getGenomePosition() {
        return read.getAlignmentStart() + getGenomeOffset();
    }

    /** @return a single-position GenomeLoc at the current genome position */
    public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
        return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
    }

    /** @return the operator of the CIGAR element currently being traversed */
    public CigarOperator getCurrentCigarOperator() {
        return curElement.getOperator();
    }

    public String toString() {
        return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
    }

    /**
     * @return the next CIGAR element when we are on the last base of the current one
     *         (and a next element exists); otherwise the current element
     */
    public CigarElement peekForwardOnGenome() {
        return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
    }

    /**
     * @return the previous CIGAR element when we are on the first base of the current one
     *         (and a previous, non-leading element exists); otherwise the current element
     */
    public CigarElement peekBackwardOnGenome() {
        return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
    }

    /**
     * Advances this state by one position on the reference, consuming read-consuming
     * CIGAR elements (I/S) along the way.
     *
     * @return the CIGAR operator now in effect, or null once we have stepped past the end of the read
     */
    public CigarOperator stepForwardOnGenome() {
        // we enter this method with readOffset = index of the last processed base on the read
        // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
        if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
            cigarOffset++;
            if (cigarOffset < nCigarElements) {
                curElement = cigar.getCigarElement(cigarOffset);
                cigarElementCounter = 0;
                // next line: guards against cigar elements of length 0; when new cigar element is retrieved,
                // we reenter in order to re-check cigarElementCounter against curElement's length
                return stepForwardOnGenome();
            } else {
                if (curElement != null && curElement.getOperator() == CigarOperator.D)
                    throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");

                // Reads that contain indels model the genomeOffset as the following base in the reference. Because
                // we fall into this else block only when indels end the read, increment genomeOffset such that the
                // current offset of this read is the next ref base after the end of the indel. This position will
                // model a point on the reference somewhere after the end of the read.
                genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
                // we do step forward on the ref, and by returning null we also indicate that we are past the read end.

                return null;
            }
        }

        boolean done = false;
        switch (curElement.getOperator()) {
            case H: // ignore hard clips
            case P: // ignore pads
                cigarElementCounter = curElement.getLength();
                break;
            case I: // insertion w.r.t. the reference
            case S: // soft clip
                cigarElementCounter = curElement.getLength();
                readOffset += curElement.getLength();
                break;
            case D: // deletion w.r.t. the reference
                if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
                    throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
                // should be the same as N case
                genomeOffset++;
                done = true;
                break;
            case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
                genomeOffset++;
                done = true;
                break;
            case M:
            case EQ:
            case X:
                readOffset++;
                genomeOffset++;
                done = true;
                break;
            default:
                throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
        }

        return done ? curElement.getOperator() : stepForwardOnGenome();
    }
}
//final boolean DEBUG = false;
//final boolean DEBUG2 = false && DEBUG;
private ReadProperties readInfo;
private AlignmentContext nextAlignmentContext;
private boolean performLevelingDownsampling;
// -----------------------------------------------------------------------------------------------------------------
//
// constructors and other basic operations
//
// -----------------------------------------------------------------------------------------------------------------
/**
 * Builds the locus iterator over the given read stream.
 *
 * @param samIterator     source of reads (assumed coordinate-sorted; see readIsPastCurrentPosition -- TODO confirm)
 * @param readInformation engine-level read properties, including the downsampling method
 * @param genomeLocParser used to create new GenomeLocs
 * @param samples         the samples whose reads will be piled up; defensively copied
 */
public LocusIteratorByStateExperimental(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
    this.readInfo = readInformation;
    this.genomeLocParser = genomeLocParser;
    this.samples = new ArrayList<String>(samples);
    this.readStates = new ReadStateManager(samIterator);
    // leveling downsampling applies only for per-sample downsampling to a target coverage
    this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null &&
            readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
            readInfo.getDownsamplingMethod().toCoverage != null;

    // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
    // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
    if (this.samples.isEmpty() && samIterator.hasNext()) {
        throw new IllegalArgumentException("samples list must not be empty");
    }
}
/**
 * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
 * for the system.
 */
public final static Collection<String> sampleListForSAMWithoutReadGroups() {
    final List<String> dummySamples = new ArrayList<String>(1);
    dummySamples.add(null);    // a single null sample stands in for "no read group"
    return dummySamples;
}
/** @return this object itself; the iterator serves as its own iterable */
public Iterator<AlignmentContext> iterator() {
    return this;
}
/** No-op: note that the underlying iterator is intentionally not closed here. */
public void close() {
    //this.it.close();
}
/**
 * Tests whether another AlignmentContext is available, lazily loading it if needed.
 *
 * @return true if a subsequent call to next() will succeed
 */
public boolean hasNext() {
    lazyLoadNextAlignmentContext();
    return nextAlignmentContext != null;
}
/**
 * The current position of the traversal, taken from the first live read state.
 *
 * @return the location of the first read state, or null when no states are held
 */
private GenomeLoc getLocation() {
    if ( readStates.isEmpty() )
        return null;
    return readStates.getFirst().getLocation(genomeLocParser);
}
// -----------------------------------------------------------------------------------------------------------------
//
// next() routine and associated collection operations
//
// -----------------------------------------------------------------------------------------------------------------
/**
 * Returns the next AlignmentContext in the stream.
 *
 * @return the next context; never null
 * @throws NoSuchElementException if the stream is exhausted
 */
public AlignmentContext next() {
    // hasNext() performs the lazy load for us
    if ( ! hasNext() )
        throw new NoSuchElementException("LocusIteratorByState: out of elements.");

    final AlignmentContext result = nextAlignmentContext;
    nextAlignmentContext = null;    // consume it so the following call advances
    return result;
}
/**
 * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method.
 * nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
 *
 * Loops until a position with at least one non-D/N pileup element is found (or the
 * read stream is exhausted), building one ReadBackedPileupImpl per sample.
 */
private void lazyLoadNextAlignmentContext() {
    while (nextAlignmentContext == null && readStates.hasNext()) {
        readStates.collectPendingReads();

        final GenomeLoc location = getLocation();
        final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();

        // TODO: How can you determine here whether the current pileup has been downsampled?
        boolean hasBeenSampled = false;

        for (final String sample : samples) {
            final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
            final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));

            int size = 0;         // number of elements in this sample's pileup
            int nDeletions = 0;   // number of deletions in this sample's pileup
            int nMQ0Reads = 0;    // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)

            while (iterator.hasNext()) {
                final SAMRecordState state = iterator.next();                   // state object with the read/offset information
                final GATKSAMRecord read = (GATKSAMRecord) state.getRead();     // the actual read
                final CigarOperator op = state.getCurrentCigarOperator();       // current cigar operator
                final CigarElement nextElement = state.peekForwardOnGenome();   // next cigar element
                final CigarElement lastElement = state.peekBackwardOnGenome();  // last cigar element
                final boolean isSingleElementCigar = nextElement == lastElement;
                final CigarOperator nextOp = nextElement.getOperator();         // next cigar operator
                final CigarOperator lastOp = lastElement.getOperator();         // last cigar operator
                int readOffset = state.getReadOffset();                         // the base offset on this read

                final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
                final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
                final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
                // a single-element cigar cannot have a real insertion preceding this base
                final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar;
                final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());

                int nextElementLength = nextElement.getLength();

                if (op == CigarOperator.N) // N's are never added to any pileup
                    continue;

                if (op == CigarOperator.D) {
                    // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
                    if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
                        pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
                        size++;
                        nDeletions++;
                        if (read.getMappingQuality() == 0)
                            nMQ0Reads++;
                    }
                }
                else {
                    if (!filterBaseInRead(read, location.getStart())) {
                        String insertedBaseString = null;
                        if (nextOp == CigarOperator.I) {
                            final int insertionOffset = isSingleElementCigar ? 0 : 1;
                            // TODO -- someone please implement a better fix for the single element insertion CIGAR!
                            if (isSingleElementCigar)
                                readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases!
                            insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength()));
                        }

                        pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
                        size++;
                        if (read.getMappingQuality() == 0)
                            nMQ0Reads++;
                    }
                }
            }

            if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
                fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
        }

        updateReadStates(); // critical - must be called after we get the current state offsets and location

        if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
            nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
    }
}
/**
 * Fast test of whether the given read starts beyond the position currently being piled up.
 *
 * @param read the candidate read
 * @return true if the read begins on a later contig, or later on this contig, than the
 *         current genome position of the first live read state; false when no states are held
 */
private boolean readIsPastCurrentPosition(SAMRecord read) {
    if (readStates.isEmpty())
        return false;

    final SAMRecordState headState = readStates.getFirst();
    final SAMRecord headRead = headState.getRead();
    if (read.getReferenceIndex() > headRead.getReferenceIndex())
        return true;
    return read.getAlignmentStart() > headState.getGenomePosition();
}
/**
 * Generic place to put per-base filters appropriate to LocusIteratorByState
 *
 * @param rec the read under consideration
 * @param pos the reference position of the base
 * @return true if the base at pos should be excluded from the pileup
 *         (currently: when it falls inside the read's adaptor sequence)
 */
private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
    return ReadUtils.isBaseInsideAdaptor(rec, pos);
}
/**
 * Steps every live read state forward one reference base, discarding any state
 * that has walked off the end of its read.
 */
private void updateReadStates() {
    for (final String sample : samples) {
        final Iterator<SAMRecordState> stateIterator = readStates.iterator(sample);
        while (stateIterator.hasNext()) {
            final SAMRecordState readState = stateIterator.next();
            final CigarOperator steppedOperator = readState.stepForwardOnGenome();
            if (steppedOperator == null) {
                // we discard the read only when we are past its end AND indel at the end of the read (if any) was
                // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
                // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
                stateIterator.remove(); // we've stepped off the end of the object
            }
        }
    }
}
/** Unsupported: reads cannot be removed through this iterator. */
public void remove() {
    throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
/**
 * Manages the live SAMRecordStates across all samples: pulls reads from the underlying
 * iterator, partitions them by sample, and (when enabled) applies leveling downsampling
 * per sample as new alignment-start groups arrive.
 */
protected class ReadStateManager {
    private final PeekableIterator<SAMRecord> iterator;
    private final SamplePartitioner samplePartitioner;
    private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
    private int totalReadStates = 0;    // running count of read states held across all samples

    public ReadStateManager(Iterator<SAMRecord> source) {
        this.iterator = new PeekableIterator<SAMRecord>(source);
        for (final String sample : samples) {
            readStatesBySample.put(sample, new PerSampleReadStateManager());
        }
        samplePartitioner = new SamplePartitioner();
    }

    /**
     * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
     * for this iterator; if present, total read states will be decremented.
     *
     * @param sample The sample.
     * @return Iterator over the reads associated with that sample.
     */
    public Iterator<SAMRecordState> iterator(final String sample) {
        return new Iterator<SAMRecordState>() {
            private Iterator<SAMRecordState> wrappedIterator = readStatesBySample.get(sample).iterator();

            public boolean hasNext() {
                return wrappedIterator.hasNext();
            }

            public SAMRecordState next() {
                return wrappedIterator.next();
            }

            public void remove() {
                wrappedIterator.remove();
            }
        };
    }

    /** @return true if no read states remain for any sample */
    public boolean isEmpty() {
        return totalReadStates == 0;
    }

    /**
     * Retrieves the total number of reads in the manager across all samples.
     *
     * @return Total number of reads over all samples.
     */
    public int size() {
        return totalReadStates;
    }

    /**
     * Retrieves the total number of reads in the manager in the given sample.
     *
     * @param sample The sample.
     * @return Total number of reads in the given sample.
     */
    public int size(final String sample) {
        return readStatesBySample.get(sample).size();
    }

    /** @return the first read state of the first sample holding any states, or null if all are empty */
    public SAMRecordState getFirst() {
        for (final String sample : samples) {
            PerSampleReadStateManager reads = readStatesBySample.get(sample);
            if (!reads.isEmpty())
                return reads.peek();
        }
        return null;
    }

    /** @return true if states are already held or more reads can be pulled from the source */
    public boolean hasNext() {
        return totalReadStates > 0 || iterator.hasNext();
    }

    /**
     * Pulls reads from the source iterator that overlap the current position (or, when no
     * states are held, that share the next read's alignment start), partitions them by
     * sample, and hands each sample's batch to its per-sample state manager.
     */
    public void collectPendingReads() {
        if (!iterator.hasNext())
            return;

        if (readStates.size() == 0) {
            // no states yet: seed with every read sharing the first alignment start
            int firstContigIndex = iterator.peek().getReferenceIndex();
            int firstAlignmentStart = iterator.peek().getAlignmentStart();
            while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
                samplePartitioner.submitRead(iterator.next());
            }
        } else {
            // Fast fail in the case that the read is past the current position.
            if (readIsPastCurrentPosition(iterator.peek()))
                return;

            while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
                samplePartitioner.submitRead(iterator.next());
            }
        }
        for (final String sample : samples) {
            Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
            PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
            addReadsToSample(statesBySample, newReads);
        }
        samplePartitioner.reset();
    }

    /**
     * Add reads with the given sample name to the given hanger entry.
     *
     * @param readStates The list of read states to add this collection of reads.
     * @param reads Reads to add. Selected reads will be pulled from this source.
     */
    private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
        if (reads.isEmpty())
            return;

        Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();

        for (SAMRecord read : reads) {
            SAMRecordState state = new SAMRecordState(read);
            state.stepForwardOnGenome();    // position the state on its first reference base
            newReadStates.add(state);
        }

        readStates.addStatesAtNextAlignmentStart(newReadStates);
    }

    /**
     * Holds one sample's read states, grouped by alignment start, and applies leveling
     * downsampling across those groups when downsampling-to-coverage is enabled.
     */
    protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
        private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
        private int thisSampleReadStates = 0;   // running count of this sample's read states
        private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
                performLevelingDownsampling ?
                        new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
                        null;

        /**
         * Appends a batch of states that all share one alignment start, then (when enabled)
         * runs the leveling downsampler over the current groups, adjusting both counters
         * by however many states it discarded.
         */
        public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
            if ( states.isEmpty() ) {
                return;
            }

            readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
            thisSampleReadStates += states.size();
            totalReadStates += states.size();

            if ( levelingDownsampler != null ) {
                levelingDownsampler.submit(readStatesByAlignmentStart);
                levelingDownsampler.signalEndOfInput();

                thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
                totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();

                // use returned List directly rather than make a copy, for efficiency's sake
                readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
                levelingDownsampler.reset();
            }
        }

        public boolean isEmpty() {
            return readStatesByAlignmentStart.isEmpty();
        }

        /** @return the first state of the earliest alignment-start group, or null when empty */
        public SAMRecordState peek() {
            return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
        }

        public int size() {
            return thisSampleReadStates;
        }

        /**
         * Iterates over all states in alignment-start order; remove() keeps both the
         * per-sample and total counters in sync and drops emptied groups.
         */
        public Iterator<SAMRecordState> iterator() {
            return new Iterator<SAMRecordState>() {
                private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
                private LinkedList<SAMRecordState> currentPositionReadStates = null;
                private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;

                public boolean hasNext() {
                    return alignmentStartIterator.hasNext() ||
                            (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
                }

                public SAMRecordState next() {
                    if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
                        currentPositionReadStates = alignmentStartIterator.next();
                        currentPositionReadStatesIterator = currentPositionReadStates.iterator();
                    }

                    return currentPositionReadStatesIterator.next();
                }

                public void remove() {
                    currentPositionReadStatesIterator.remove();
                    thisSampleReadStates--;
                    totalReadStates--;

                    if ( currentPositionReadStates.isEmpty() ) {
                        alignmentStartIterator.remove();
                    }
                }
            };
        }
    }
}
/**
 * Note: stores reads by sample ID string, not by sample object.
 *
 * Buffers incoming reads into per-sample buckets so each sample's batch can be handed
 * to its read state manager; reads from samples we are not tracking are dropped.
 */
private class SamplePartitioner {
    private Map<String, Collection<SAMRecord>> readsBySample;
    private long readsSeen = 0;    // total reads submitted since the last reset

    public SamplePartitioner() {
        readsBySample = new HashMap<String, Collection<SAMRecord>>();
        for ( final String sample : samples )
            readsBySample.put(sample, new ArrayList<SAMRecord>());
    }

    /** Files the read under its read group's sample (or the null sample), if tracked. */
    public void submitRead(SAMRecord read) {
        final String sampleName = read.getReadGroup() == null ? null : read.getReadGroup().getSample();
        final Collection<SAMRecord> bucket = readsBySample.get(sampleName);
        if (bucket != null)    // silently drop reads from samples we aren't tracking
            bucket.add(read);
        readsSeen++;
    }

    /** @return the number of reads submitted since the last reset */
    public long getNumReadsSeen() {
        return readsSeen;
    }

    /**
     * @return the buffered reads for the given sample
     * @throws NoSuchElementException if the sample is not tracked
     */
    public Collection<SAMRecord> getReadsForSample(String sampleName) {
        final Collection<SAMRecord> bucket = readsBySample.get(sampleName);
        if (bucket == null)
            throw new NoSuchElementException("Sample name not found");
        return bucket;
    }

    /** Empties every bucket and zeroes the submission counter. */
    public void reset() {
        for ( final Collection<SAMRecord> perSampleReads : readsBySample.values() )
            perSampleReads.clear();
        readsSeen = 0;
    }
}
}

View File

@ -0,0 +1,144 @@
package org.broadinstitute.sting.gatk.iterators;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* Baseclass used to describe a read transformer like BAQ and BQSR
*
* Read transformers are plugable infrastructure that modify read state
* either on input, on output, or within walkers themselves.
*
* The function apply() is called on each read seen by the GATK (after passing
* all ReadFilters) and it can do as it sees fit (without modifying the alignment)
* to the read to change qualities, add tags, etc.
*
* Initialize is called once right before the GATK traversal begins providing
* the ReadTransformer with the ability to collect and initialize data from the
* engine.
*
* Note that all ReadTransformers within the classpath are created and initialized. If one
* shouldn't be run it should look at the command line options of the engine and override
* the enabled.
*
* @since 8/31/12
* @author depristo
*/
abstract public class ReadTransformer {
/**
* When should this read transform be applied?
*/
private ApplicationTime applicationTime;
/**
* Keep track of whether we've been initialized already, and ensure it's not called more than once.
*/
private boolean initialized = false;
protected ReadTransformer() {}
/**
 * Master initialization routine.  Sets up this ReadTransformer via its overloaded
 * initializeSub() routine, optionally forcing the application time.
 *
 * @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
 * @param engine the engine, for initializing values; must not be null
 * @param walker the walker we intend to run; must not be null
 * @throws IllegalArgumentException if engine or walker is null
 * @throws IllegalStateException if this transformer has already been initialized
 */
@Requires({"initialized == false", "engine != null", "walker != null"})
@Ensures("initialized == true")
public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) {
    if ( engine == null ) throw new IllegalArgumentException("engine cannot be null");
    if ( walker == null ) throw new IllegalArgumentException("walker cannot be null");
    // enforce at runtime what the @Requires contract and the initialized field's comment
    // promise: initialization must happen exactly once per transformer
    if ( initialized ) throw new IllegalStateException("initialize() may only be called once per ReadTransformer");

    this.applicationTime = initializeSub(engine, walker);
    if ( overrideTime != null ) this.applicationTime = overrideTime;
    initialized = true;
}
/**
* Subclasses must override this to initialize themeselves
*
* @param engine the engine, for initializing values
* @param walker the walker we intend to run
* @return the point of time we'd like this read transform to be run
*/
@Requires({"engine != null", "walker != null"})
@Ensures("result != null")
protected abstract ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker);
/**
* Should this ReadTransformer be activated? Called after initialize, which allows this
* read transformer to look at its arguments and decide if it should be active. All
* ReadTransformers must override this, as by default they are not enabled.
*
* @return true if this ReadTransformer should be used on the read stream
*/
public boolean enabled() {
return false;
}
/**
* Has this transformer been initialized?
*
* @return true if it has
*/
public final boolean isInitialized() {
return initialized;
}
/**
* When should we apply this read transformer?
*
* @return true if yes
*/
public final ApplicationTime getApplicationTime() {
return applicationTime;
}
/**
* Primary interface function for a read transform to actually do some work
*
* The function apply() is called on each read seen by the GATK (after passing
* all ReadFilters) and it can do as it sees fit (without modifying the alignment)
* to the read to change qualities, add tags, etc.
*
* @param read the read to transform
* @return the transformed read
*/
@Requires("read != null")
@Ensures("result != null")
abstract public GATKSAMRecord apply(final GATKSAMRecord read);
@Override
public String toString() {
return getClass().getSimpleName();
}
/**
* When should a read transformer be applied?
*/
public static enum ApplicationTime {
/**
* Walker does not tolerate this read transformer
*/
FORBIDDEN,
/**
* apply the transformation to the incoming reads, the default
*/
ON_INPUT,
/**
* apply the transformation to the outgoing read stream
*/
ON_OUTPUT,
/**
* the walker will deal with the calculation itself
*/
HANDLED_IN_WALKER
}
}

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.gatk.iterators;
import java.lang.annotation.*;
/**
* User: hanna
* Date: May 14, 2009
* Time: 1:51:22 PM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
 * Allows a walker class to declare when its read transformers (e.g., BAQ, BQSR) should be applied.
 *
 * The annotation is retained at runtime and inherited, so subclasses of an annotated
 * walker share its setting unless they declare their own.
 */
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface ReadTransformersMode {
    /** When the read transformers should run for this walker; defaults to transforming reads on input. */
    public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT;
}

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -11,13 +10,11 @@ import java.util.Iterator;
* Verifies that the incoming stream of reads is correctly sorted
*/
public class VerifyingSamIterator implements StingSAMIterator {
private GenomeLocParser genomeLocParser;
StingSAMIterator it;
SAMRecord last = null;
boolean checkOrderP = true;
public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) {
this.genomeLocParser = genomeLocParser;
public VerifyingSamIterator(StingSAMIterator it) {
this.it = it;
}
@ -48,9 +45,9 @@ public class VerifyingSamIterator implements StingSAMIterator {
if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)
throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format()));
GenomeLoc lastLoc = genomeLocParser.createGenomeLoc( last );
GenomeLoc curLoc = genomeLocParser.createGenomeLoc( cur );
return curLoc.compareTo(lastLoc) == -1;
return (last.getReferenceIndex() > cur.getReferenceIndex()) ||
(last.getReferenceIndex().equals(cur.getReferenceIndex()) &&
last.getAlignmentStart() > cur.getAlignmentStart());
}
}

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
import org.jets3t.service.S3Service;
import org.jets3t.service.S3ServiceException;
import org.jets3t.service.impl.rest.httpclient.RestS3Service;
@ -141,6 +142,21 @@ public class GATKRunReport {
@Element(required = true, name = "tag")
private String tag;
// -----------------------------------------------------------------
// elements related to multi-threading and efficiency
// -----------------------------------------------------------------
@Element(required = true, name = "numThreads")
private int numThreads;
@Element(required = true, name = "percent_time_running")
private String percentTimeRunning;
@Element(required = true, name = "percent_time_waiting")
private String percentTimeWaiting;
@Element(required = true, name = "percent_time_blocking")
private String percentTimeBlocking;
@Element(required = true, name = "percent_time_waiting_for_io")
private String percentTimeWaitingForIO;
public enum PhoneHomeOption {
/** Disable phone home */
NO_ET,
@ -201,12 +217,30 @@ public class GATKRunReport {
// if there was an exception, capture it
this.mException = e == null ? null : new ExceptionToXML(e);
numThreads = engine.getTotalNumberOfThreads();
percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU);
percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING);
percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING);
percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO);
}
public String getID() {
return id;
}
/**
* Return a string representing the percent of time the GATK spent in state, if possible. Otherwise return NA
*
* @param engine the GATK engine whose threading efficiency info we will use
* @param state the state whose occupancy we wish to know
* @return a string representation of the percent occupancy of state, or NA is not possible
*/
private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) {
final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor();
return tem == null ? "NA" : String.format("%.2f", tem.getStatePercent(state));
}
public void postReport(PhoneHomeOption type) {
logger.debug("Posting report of type " + type);

View File

@ -1,179 +0,0 @@
/*
* Copyright (c) 2010. The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.*;
/**
 * A read-based meta data tracker: maps reference-ordered data (RODs) onto the span
 * of a single read, keyed either by read offset or by reference position.
 *
 * @author aaron
 */
public class ReadMetaDataTracker {
    /**
     * The parser, used to create new GenomeLocs.
     */
    private final GenomeLocParser genomeLocParser;

    // the read whose span we align RODs against
    private final SAMRecord record;

    // the buffer of positions and RODs we've stored
    private final TreeMap<Integer, RODMetaDataContainer> mapping;

    /**
     * create a read meta data tracker, given the read and a queue of RODatum positions
     *
     * @param genomeLocParser parser used to build GenomeLocs from the read
     * @param record the read to create offset from
     * @param mapping the mapping of reference ordered datum, keyed by reference position
     */
    public ReadMetaDataTracker(GenomeLocParser genomeLocParser, SAMRecord record, TreeMap<Integer, RODMetaDataContainer> mapping) {
        this.genomeLocParser = genomeLocParser;
        this.record = record;
        this.mapping = mapping;
    }

    /**
     * create an alignment of read position to reference ordered datum
     *
     * Keys of the returned map are 0-based offsets into the read; entries are only added
     * when the ROD set at that position is non-null and non-empty.
     *
     * @param record the SAMRecord
     * @param queue the queue (as a tree set)
     * @param cl the class name, null if not filtered by classname
     * @param name the datum track name, null if not filtered by name
     *
     * @return a mapping from the position in the read to the reference ordered datum
     */
    private Map<Integer, Collection<GATKFeature>> createReadAlignment(SAMRecord record, TreeMap<Integer, RODMetaDataContainer> queue, Class cl, String name) {
        // at most one of the two filters (class, name) may be active
        if (name != null && cl != null) throw new IllegalStateException("Both a class and name cannot be specified");
        Map<Integer, Collection<GATKFeature>> ret = new LinkedHashMap<Integer, Collection<GATKFeature>>();
        GenomeLoc location = genomeLocParser.createGenomeLoc(record);
        int length = record.getReadLength();
        // NOTE(review): this scans every key in the TreeMap; queue.subMap(...) bounded by the
        // read's span would visit only the relevant keys -- confirm before changing
        for (Integer loc : queue.keySet()) {
            // convert reference position to 0-based offset within the read
            Integer position = loc - location.getStart();
            if (position >= 0 && position < length) {
                Collection<GATKFeature> set;
                if (cl != null)
                    set = queue.get(loc).getSet(cl);
                else
                    set = queue.get(loc).getSet(name);
                // only record positions that actually have data
                if (set != null && set.size() > 0)
                    ret.put(position, set);
            }
        }
        return ret;
    }

    /**
     * create an alignment of reference position to reference ordered datum
     *
     * Keys of the returned map are absolute reference coordinates within the read's
     * aligned span (getAlignmentStart..getAlignmentEnd, inclusive).
     *
     * NOTE(review): unlike createReadAlignment, this puts the result of getSet() into the
     * map without a null/empty guard, so callers may see null values -- confirm intent.
     *
     * @return a mapping from the reference position to the reference ordered datum
     */
    private Map<Integer, Collection<GATKFeature>> createGenomeLocAlignment(SAMRecord record, TreeMap<Integer, RODMetaDataContainer> mapping, Class cl, String name) {
        Map<Integer, Collection<GATKFeature>> ret = new LinkedHashMap<Integer, Collection<GATKFeature>>();
        int start = record.getAlignmentStart();
        int stop = record.getAlignmentEnd();
        for (Integer location : mapping.keySet()) {
            if (location >= start && location <= stop)
                if (cl != null)
                    ret.put(location, mapping.get(location).getSet(cl));
                else
                    ret.put(location, mapping.get(location).getSet(name));
        }
        return ret;
    }

    /**
     * get the position mapping, from read offset to ROD, with no track filtering
     *
     * @return a mapping of read offset to ROD(s)
     */
    public Map<Integer, Collection<GATKFeature>> getReadOffsetMapping() {
        return createReadAlignment(record, mapping, null, null);
    }

    /**
     * get the position mapping, from reference position to ROD, with no track filtering
     *
     * @return a mapping of genome loc position to ROD(s)
     */
    public Map<Integer, Collection<GATKFeature>> getContigOffsetMapping() {
        return createGenomeLocAlignment(record, mapping, null, null);
    }

    /**
     * get the position mapping, from read offset to ROD, filtered by track name
     *
     * @param name the datum track name to filter by
     * @return a mapping of read offset to ROD(s)
     */
    public Map<Integer, Collection<GATKFeature>> getReadOffsetMapping(String name) {
        return createReadAlignment(record, mapping, null, name);
    }

    /**
     * get the position mapping, from reference position to ROD, filtered by track name
     *
     * @param name the datum track name to filter by
     * @return a mapping of genome loc position to ROD(s)
     */
    public Map<Integer, Collection<GATKFeature>> getContigOffsetMapping(String name) {
        return createGenomeLocAlignment(record, mapping, null, name);
    }

    /**
     * get the position mapping, from read offset to ROD, filtered by class
     *
     * @param cl the ROD class to filter by
     * @return a mapping of read offset to ROD(s)
     */
    public Map<Integer, Collection<GATKFeature>> getReadOffsetMapping(Class cl) {
        return createReadAlignment(record, mapping, cl, null);
    }

    /**
     * get the position mapping, from reference position to ROD, filtered by class
     *
     * @param cl the ROD class to filter by
     * @return a mapping of genome loc position to ROD(s)
     */
    public Map<Integer, Collection<GATKFeature>> getContigOffsetMapping(Class cl) {
        return createGenomeLocAlignment(record, mapping, cl, null);
    }

    /**
     * get the list of all the RODS overlapping this read, without any information about their position
     * @return a Collection (no order guaranteed), of all the RODs covering this read
     */
    public List<GATKFeature> getAllCoveringRods() {
        List<GATKFeature> ret = new ArrayList<GATKFeature>();
        for (Map.Entry<Integer, RODMetaDataContainer> entry : mapping.entrySet())
            ret.addAll(entry.getValue().getSet());
        return ret;
    }
}

View File

@ -5,7 +5,6 @@ import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -32,11 +31,10 @@ import java.util.*;
* Time: 3:05:23 PM
*/
public class RefMetaDataTracker {
// TODO: this should be a list, not a map, actually
// TODO: this should be a list, not a map of bindings, actually
private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY");
final Map<String, RODRecordList> map;
final ReferenceContext ref;
final Map<String, RODRecordList> bindings;
final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class);
// ------------------------------------------------------------------------------------------
@ -48,28 +46,25 @@ public class RefMetaDataTracker {
// ------------------------------------------------------------------------------------------
/**
* Only for testing -- not accesssible in any other context
* Create a tracker with no bindings
*/
public RefMetaDataTracker() {
ref = null;
map = Collections.emptyMap();
bindings = Collections.emptyMap();
}
public RefMetaDataTracker(final Collection<RODRecordList> allBindings, final ReferenceContext ref) {
this.ref = ref;
// set up the map
public RefMetaDataTracker(final Collection<RODRecordList> allBindings) {
// set up the bindings
if ( allBindings.isEmpty() )
map = Collections.emptyMap();
bindings = Collections.emptyMap();
else {
Map<String, RODRecordList> tmap = new HashMap<String, RODRecordList>(allBindings.size());
final Map<String, RODRecordList> tmap = new HashMap<String, RODRecordList>(allBindings.size());
for ( RODRecordList rod : allBindings ) {
if ( rod != null && ! rod.isEmpty() )
tmap.put(canonicalName(rod.getName()), rod);
}
// ensure that no one modifies the map itself
map = Collections.unmodifiableMap(tmap);
// ensure that no one modifies the bindings itself
bindings = Collections.unmodifiableMap(tmap);
}
}
@ -99,7 +94,7 @@ public class RefMetaDataTracker {
@Requires({"type != null"})
@Ensures("result != null")
public <T extends Feature> List<T> getValues(final Class<T> type) {
return addValues(map.keySet(), type, new ArrayList<T>(), null, false, false);
return addValues(bindings.keySet(), type, new ArrayList<T>(), null, false, false);
}
/**
@ -114,7 +109,7 @@ public class RefMetaDataTracker {
@Requires({"type != null", "onlyAtThisLoc != null"})
@Ensures("result != null")
public <T extends Feature> List<T> getValues(final Class<T> type, final GenomeLoc onlyAtThisLoc) {
return addValues(map.keySet(), type, new ArrayList<T>(), onlyAtThisLoc, true, false);
return addValues(bindings.keySet(), type, new ArrayList<T>(), onlyAtThisLoc, true, false);
}
/**
@ -296,7 +291,7 @@ public class RefMetaDataTracker {
*/
@Requires({"rodBinding != null"})
public boolean hasValues(final RodBinding rodBinding) {
return map.containsKey(canonicalName(rodBinding.getName()));
return bindings.containsKey(canonicalName(rodBinding.getName()));
}
/**
@ -306,7 +301,7 @@ public class RefMetaDataTracker {
* @return List of all tracks
*/
public List<RODRecordList> getBoundRodTracks() {
return new ArrayList<RODRecordList>(map.values());
return new ArrayList<RODRecordList>(bindings.values());
}
/**
@ -314,38 +309,30 @@ public class RefMetaDataTracker {
* @return the number of tracks with at least one bound Feature
*/
public int getNTracksWithBoundFeatures() {
return map.size();
return bindings.size();
}
// ------------------------------------------------------------------------------------------
//
//
// old style accessors
//
// TODO -- DELETE ME
//
//
// Protected accessors using strings for unit testing
// ------------------------------------------------------------------------------------------
@Deprecated
public boolean hasValues(final String name) {
return map.containsKey(canonicalName(name));
protected boolean hasValues(final String name) {
return bindings.containsKey(canonicalName(name));
}
@Deprecated
public <T extends Feature> List<T> getValues(final Class<T> type, final String name) {
protected <T extends Feature> List<T> getValues(final Class<T> type, final String name) {
return addValues(name, type, new ArrayList<T>(), getTrackDataByName(name), null, false, false);
}
@Deprecated
public <T extends Feature> List<T> getValues(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
protected <T extends Feature> List<T> getValues(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
return addValues(name, type, new ArrayList<T>(), getTrackDataByName(name), onlyAtThisLoc, true, false);
}
@Deprecated
public <T extends Feature> T getFirstValue(final Class<T> type, final String name) {
protected <T extends Feature> T getFirstValue(final Class<T> type, final String name) {
return safeGetFirst(getValues(type, name));
}
@Deprecated
public <T extends Feature> T getFirstValue(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
protected <T extends Feature> T getFirstValue(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
return safeGetFirst(getValues(type, name, onlyAtThisLoc));
}
@ -366,7 +353,7 @@ public class RefMetaDataTracker {
* @return
*/
@Requires({"l != null"})
final private <T extends Feature> T safeGetFirst(final List<T> l) {
private <T extends Feature> T safeGetFirst(final List<T> l) {
return l.isEmpty() ? null : l.get(0);
}
@ -435,7 +422,7 @@ public class RefMetaDataTracker {
*/
private RODRecordList getTrackDataByName(final String name) {
final String luName = canonicalName(name);
RODRecordList l = map.get(luName);
RODRecordList l = bindings.get(luName);
return l == null ? EMPTY_ROD_RECORD_LIST : l;
}
@ -448,7 +435,7 @@ public class RefMetaDataTracker {
* @param name the name of the rod
* @return canonical name of the rod
*/
private final String canonicalName(final String name) {
private String canonicalName(final String name) {
// todo -- remove me after switch to RodBinding syntax
return name.toLowerCase();
}

View File

@ -24,7 +24,7 @@
package org.broadinstitute.sting.gatk.resourcemanagement;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* Models how threads are distributed between various components of the GATK.
@ -33,61 +33,83 @@ public class ThreadAllocation {
/**
* The number of CPU threads to be used by the GATK.
*/
private final int numCPUThreads;
private final int numDataThreads;
/**
* The number of CPU threads per data thread for GATK processing
*/
private final int numCPUThreadsPerDataThread;
/**
* Number of threads to devote exclusively to IO. Default is 0.
*/
private final int numIOThreads;
public int getNumCPUThreads() {
return numCPUThreads;
/**
* Should we monitor thread efficiency?
*/
private final boolean monitorEfficiency;
public int getNumDataThreads() {
return numDataThreads;
}
public int getNumCPUThreadsPerDataThread() {
return numCPUThreadsPerDataThread;
}
public int getNumIOThreads() {
return numIOThreads;
}
public boolean monitorThreadEfficiency() {
return monitorEfficiency;
}
/**
* Are we running in parallel mode?
*
* @return true if any parallel processing is enabled
*/
public boolean isRunningInParallelMode() {
return getTotalNumThreads() > 1;
}
/**
* What is the total number of threads in use by the GATK?
*
* @return the sum of all thread allocations in this object
*/
public int getTotalNumThreads() {
return getNumDataThreads() * getNumCPUThreadsPerDataThread() + getNumIOThreads();
}
/**
* Construct the default thread allocation.
*/
public ThreadAllocation() {
this(1,null,null);
this(1, 1, 0, false);
}
/**
* Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads.
* (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
* @param totalThreads Complete number of threads to allocate.
* @param numCPUThreads Total number of threads allocated to the traversal.
* @param numDataThreads Total number of threads allocated to the traversal.
* @param numCPUThreadsPerDataThread The number of CPU threads per data thread to allocate
* @param numIOThreads Total number of threads allocated exclusively to IO.
* @param monitorEfficiency should we monitor threading efficiency in the GATK?
*/
public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
// If no allocation information is present, allocate all threads to CPU
if(numCPUThreads == null && numIOThreads == null) {
this.numCPUThreads = totalThreads;
this.numIOThreads = 0;
}
// If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads).
else if(numIOThreads == null) {
if(numCPUThreads > totalThreads)
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
this.numCPUThreads = numCPUThreads;
this.numIOThreads = totalThreads - numCPUThreads;
}
// If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread).
else if(numCPUThreads == null) {
if(numIOThreads > totalThreads)
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
this.numCPUThreads = Math.max(1,totalThreads-numIOThreads);
this.numIOThreads = numIOThreads;
}
else {
if(numCPUThreads + numIOThreads != totalThreads)
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
this.numCPUThreads = numCPUThreads;
this.numIOThreads = numIOThreads;
}
}
/**
 * Set up the thread allocation.
 *
 * @param numDataThreads total number of data threads allocated to the traversal; must be at least 1
 * @param numCPUThreadsPerDataThread the number of CPU threads per data thread to allocate; must be at least 1
 * @param numIOThreads total number of threads allocated exclusively to IO; must be at least 0
 *                     (0 means no threads are devoted exclusively to IO)
 * @param monitorEfficiency should we monitor threading efficiency in the GATK?
 */
public ThreadAllocation(final int numDataThreads,
                        final int numCPUThreadsPerDataThread,
                        final int numIOThreads,
                        final boolean monitorEfficiency) {
    // reject impossible thread counts up front
    if ( numDataThreads < 1 ) throw new ReviewedStingException("numDataThreads cannot be less than 1, but saw " + numDataThreads);
    if ( numCPUThreadsPerDataThread < 1 ) throw new ReviewedStingException("numCPUThreadsPerDataThread cannot be less than 1, but saw " + numCPUThreadsPerDataThread);
    if ( numIOThreads < 0 ) throw new ReviewedStingException("numIOThreads cannot be less than 0, but saw " + numIOThreads);

    this.numDataThreads = numDataThreads;
    this.numCPUThreadsPerDataThread = numCPUThreadsPerDataThread;
    this.numIOThreads = numIOThreads;
    this.monitorEfficiency = monitorEfficiency;
}
}

View File

@ -168,13 +168,70 @@ public class SampleDB {
return families;
}
/**
 * Returns all the trios present in the sample database.
 *
 * The strictOneChild parameter determines whether multiple children of the same
 * parental pair resolve to multiple trios (false) or are excluded entirely (true).
 *
 * @param strictOneChild exclude pedigrees with more than one child for a parental pair
 * @return all of the mother+father=child triplets, subject to strictOneChild
 */
public final Set<Trio> getTrios(boolean strictOneChild) {
    Set<Trio> result = new HashSet<Trio>();

    // walk every family and collect each child that has both of its parents present
    for ( final String familyID : getFamilyIDs() ) {
        for ( final Sample member : getFamily(familyID) ) {
            final boolean hasBothParents = member.getParents().size() == 2;
            if ( hasBothParents )
                result.add(new Trio(member.getMother(), member.getFather(), member));
        }
    }

    return strictOneChild ? removeTriosWithSameParents(result) : result;
}
/**
 * Returns all the trios present in the sample database, including multiple trios
 * that share the same parental pair.  Equivalent to getTrios(false).
 *
 * @return all the trios present in the samples db.
 */
public final Set<Trio> getTrios() {
    return getTrios(false);
}
/**
 * Subsets a set of trios to only those with non-matching founders.
 *
 * If two (or more) Trio objects share both the same mother and the same father,
 * every one of them is dropped from the returned set.
 *
 * @param trios a set of Trio objects
 * @return the subset of the input trios whose parental pair is unique
 */
private Set<Trio> removeTriosWithSameParents(final Set<Trio> trios) {
    // first pass: mark every trio that shares its parental pair with another trio
    final Set<Trio> duplicatedParents = new HashSet<Trio>();
    for ( final Trio first : trios ) {
        for ( final Trio second : trios ) {
            final boolean sameCouple = first != second &&
                    first.getFather().equals(second.getFather()) &&
                    first.getMother().equals(second.getMother());
            if ( sameCouple ) {
                duplicatedParents.add(first);
                duplicatedParents.add(second);
            }
        }
    }

    // second pass: keep only the unmarked trios
    final Set<Trio> uniqueParentTrios = new HashSet<Trio>(trios);
    uniqueParentTrios.removeAll(duplicatedParents);
    return uniqueParentTrios;
}
/**
* Returns the set of all children that have both of their parents.
* Note that if a family is composed of more than 1 child, each child is
* returned.
* @return - all the children that have both of their parents
* @deprecated - getTrios() replaces this function
*/
@Deprecated
public final Set<Sample> getChildrenWithParents(){
return getChildrenWithParents(false);
}
@ -188,7 +245,15 @@ public class SampleDB {
*
* @param triosOnly - if set to true, only strict trios are returned
* @return - all the children that have both of their parents
* @deprecated - getTrios(boolean strict) replaces this function
* @bug -- does not work for extracting multiple generations of trios, e.g.
* ..........Mom1------Dad1
* ................|
* ..............Child1--------Mom2
* .......................|
* .....................Child2
*/
@Deprecated
public final Set<Sample> getChildrenWithParents(boolean triosOnly) {
Map<String, Set<Sample>> families = getFamilies();

View File

@ -135,9 +135,8 @@ public class SampleDBBuilder {
// --------------------------------------------------------------------------------
protected final void validate() {
if ( validationStrictness == PedigreeValidationType.SILENT )
return;
else {
validatePedigreeIDUniqueness();
if ( validationStrictness != PedigreeValidationType.SILENT ) {
// check that samples in data sources are all annotated, if anything is annotated
if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) {
final Set<String> sampleNamesFromPedigrees = new HashSet<String>();
@ -150,4 +149,12 @@ public class SampleDBBuilder {
}
}
}
/**
 * Checks that every sample read from the pedigree files has a unique ID.
 *
 * A sample associated with multiple families would appear more than once in
 * samplesFromPedigrees while producing fewer distinct IDs.
 *
 * @throws IllegalStateException if two pedigree samples share the same ID
 */
private void validatePedigreeIDUniqueness() {
    final Set<String> pedigreeIDs = new HashSet<String>();
    for ( final Sample sample : samplesFromPedigrees ) {
        pedigreeIDs.add(sample.getID());
    }
    // this was previously a plain assert, which is silently skipped unless the JVM
    // runs with -ea, so the validation never fired in normal operation; enforce it explicitly
    // NOTE(review): a GATK UserException may be more appropriate if duplicate IDs indicate bad user input -- confirm
    if ( pedigreeIDs.size() != samplesFromPedigrees.size() )
        throw new IllegalStateException("The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?");
}
}

View File

@ -0,0 +1,45 @@
package org.broadinstitute.sting.gatk.samples;
/**
 * A class for imposing a trio structure on three samples; a common paradigm
 *
 * todo -- there should probably be an interface or abstract class "Pedigree" that generalizes the notion of
 *      -- imposing structure on samples.  But given how complex pedigrees can quickly become, it's not
 *      -- clear the best way to do this.
 */
public class Trio {
    // the trio members are fixed at construction time
    private final Sample mother;
    private final Sample father;
    private final Sample child;

    /**
     * Create a trio from a mother, father, and child.
     *
     * @param mom   the mother; her ID must match the child's maternal ID
     * @param dad   the father; his ID must match the child's paternal ID
     * @param spawn the child
     * @throws IllegalArgumentException if the three samples do not form a consistent trio
     */
    public Trio(final Sample mom, final Sample dad, final Sample spawn) {
        // this was previously an assert, which is silently skipped unless the JVM runs
        // with -ea; validate explicitly so inconsistent trios are always rejected
        if ( ! (mom.getID().equals(spawn.getMaternalID()) && dad.getID().equals(spawn.getPaternalID())) )
            throw new IllegalArgumentException("Samples passed to trio constructor do not form a trio");
        mother = mom;
        father = dad;
        child = spawn;
    }

    /** @return the mother of the trio */
    public Sample getMother() {
        return mother;
    }

    /** @return the mother's sample ID */
    public String getMaternalID() {
        return mother.getID();
    }

    /** @return the father of the trio */
    public Sample getFather() {
        return father;
    }

    /** @return the father's sample ID */
    public String getPaternalID() {
        return father.getID();
    }

    /** @return the child of the trio */
    public Sample getChild() {
        return child;
    }

    /** @return the child's sample ID */
    public String getChildID() {
        return child.getID();
    }
}

View File

@ -44,24 +44,12 @@ import java.util.List;
import java.util.Map;
public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,ProviderType extends ShardDataProvider> {
/** our log, which we want to capture anything from this class */
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
// Time in milliseconds since we initialized this engine
private static final int HISTORY_WINDOW_SIZE = 50;
private static class ProcessingHistory {
double elapsedSeconds;
long unitsProcessed;
long bpProcessed;
GenomeLoc loc;
public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) {
this.elapsedSeconds = elapsedSeconds;
this.loc = loc;
this.unitsProcessed = unitsProcessed;
this.bpProcessed = bpProcessed;
}
}
/** lock object to sure updates to history are consistent across threads */
private static final Object lock = new Object();
LinkedList<ProcessingHistory> history = new LinkedList<ProcessingHistory>();
@ -70,13 +58,12 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
private SimpleTimer timer = null;
// How long can we go without printing some progress info?
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
private int printProgressCheckCounter = 0;
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
private long progressPrintFrequency = 10 * 1000; // in milliseconds
private boolean progressMeterInitialized = false;
// for performance log
@ -85,15 +72,12 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
private File performanceLogFile;
private PrintStream performanceLog = null;
private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log?
private final long PERFORMANCE_LOG_PRINT_FREQUENCY = PROGRESS_PRINT_FREQUENCY; // in milliseconds
private final long PERFORMANCE_LOG_PRINT_FREQUENCY = progressPrintFrequency; // in milliseconds
/** Size, in bp, of the area we are processing. Updated once in the system in initial for performance reasons */
long targetSize = -1;
GenomeLocSortedSet targetIntervals = null;
/** our log, which we want to capture anything from this class */
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
protected GenomeAnalysisEngine engine;
// ----------------------------------------------------------------------------------------------------
@ -186,15 +170,35 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS;
}
/**
* Update the cumulative traversal metrics according to the data in this shard
*
* @param shard a non-null shard
*/
public void updateCumulativeMetrics(final Shard shard) {
updateCumulativeMetrics(shard.getReadMetrics());
}
/**
* Update the cumulative traversal metrics according to the data in this shard
*
* @param singleTraverseMetrics read metrics object containing the information about a single shard's worth
* of data processing
*/
public void updateCumulativeMetrics(final ReadMetrics singleTraverseMetrics) {
engine.getCumulativeMetrics().incrementMetrics(singleTraverseMetrics);
}
/**
* Forward request to printProgress
*
* Assumes that one cycle has been completed
*
* @param loc the location
*/
public void printProgress(Shard shard, GenomeLoc loc) {
public void printProgress(final GenomeLoc loc) {
// A bypass is inserted here for unit testing.
printProgress(loc,shard.getReadMetrics(),false);
printProgress(loc, false);
}
/**
@ -202,15 +206,10 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* every M seconds, for N and M set in global variables.
*
* @param loc Current location, can be null if you are at the end of the traversal
* @param metrics Data processed since the last cumulative
* @param mustPrint If true, will print out info, regardless of nRecords or time interval
*/
private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) {
if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 )
// don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES
return;
if(!progressMeterInitialized && mustPrint == false ) {
private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) {
if( ! progressMeterInitialized ) {
logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]");
logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining",
"Location", getTraversalType(), getTraversalType()));
@ -218,40 +217,34 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
}
final long curTime = timer.currentTime();
boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY);
boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency);
boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY);
if ( printProgress || printLog ) {
// getting and appending metrics data actually turns out to be quite a heavyweight
// operation. Postpone it until after determining whether to print the log message.
ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? engine.getCumulativeMetrics() : new ReadMetrics();
if(metrics != null)
cumulativeMetrics.incrementMetrics(metrics);
final long nRecords = cumulativeMetrics.getNumIterations();
ProcessingHistory last = updateHistory(loc,cumulativeMetrics);
final ProcessingHistory last = updateHistory(loc, engine.getCumulativeMetrics());
final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds);
final AutoFormattingTime bpRate = new AutoFormattingTime(secondsPerMillionBP(last));
final AutoFormattingTime unitRate = new AutoFormattingTime(secondsPerMillionElements(last));
final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last);
final AutoFormattingTime bpRate = new AutoFormattingTime(last.secondsPerMillionBP());
final AutoFormattingTime unitRate = new AutoFormattingTime(last.secondsPerMillionElements());
final double fractionGenomeTargetCompleted = last.calculateFractionGenomeTargetCompleted(targetSize);
final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted);
final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds());
final long nRecords = engine.getCumulativeMetrics().getNumIterations();
if ( printProgress ) {
lastProgressPrintTime = curTime;
// dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates
if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS )
PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds
progressPrintFrequency = 60 * 1000; // in milliseconds
else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS )
PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds
progressPrintFrequency = 30 * 1000; // in milliseconds
else
PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
progressPrintFrequency = 10 * 1000; // in milliseconds
logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s",
loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate,
final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : String.format("%s:%d", loc.getContig(), loc.getStart());
logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s",
posName, nRecords*1.0, elapsed, unitRate,
100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion));
}
@ -277,7 +270,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @param metrics information about what's been processed already
* @return
*/
private final ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) {
private ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) {
synchronized (lock) {
if ( history.size() > HISTORY_WINDOW_SIZE )
history.pop();
@ -290,26 +283,11 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
}
}
/** How long in seconds to process 1M traversal units? */
private final double secondsPerMillionElements(ProcessingHistory last) {
return (last.elapsedSeconds * 1000000.0) / Math.max(last.unitsProcessed, 1);
}
/** How long in seconds to process 1M bp on the genome? */
private final double secondsPerMillionBP(ProcessingHistory last) {
return (last.elapsedSeconds * 1000000.0) / Math.max(last.bpProcessed, 1);
}
/** What fraction of the target intervals have we covered? */
private final double calculateFractionGenomeTargetCompleted(ProcessingHistory last) {
return (1.0*last.bpProcessed) / targetSize;
}
/**
* Called after a traversal to print out information about the traversal process
*/
public void printOnTraversalDone() {
printProgress(null, null, true);
printProgress(null, true);
final double elapsed = timer == null ? 0 : timer.getElapsedTime();
@ -370,7 +348,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @return Frequency, in seconds, of performance log writes.
*/
public long getPerformanceProgressPrintFrequencySeconds() {
return PROGRESS_PRINT_FREQUENCY;
return progressPrintFrequency;
}
/**
@ -378,6 +356,35 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @param seconds number of seconds between messages indicating performance frequency.
*/
public void setPerformanceProgressPrintFrequencySeconds(long seconds) {
PROGRESS_PRINT_FREQUENCY = seconds;
progressPrintFrequency = seconds;
}
/**
 * A snapshot of traversal progress at one moment in time: how long we have been
 * running and how much data (traversal units and genomic bp) has been processed.
 * The progress meter keeps a sliding window of these snapshots to estimate
 * processing rates and time-to-completion.
 */
private static class ProcessingHistory {
    double elapsedSeconds;   // wall-clock seconds elapsed when this snapshot was taken
    long unitsProcessed;     // cumulative traversal units (reads, loci, ...) processed
    long bpProcessed;        // cumulative genomic base pairs covered
    GenomeLoc loc;           // position reached at snapshot time; may be null at the end of a traversal

    /**
     * @param elapsedSeconds wall-clock seconds since the traversal started
     * @param loc            current genomic position (may be null)
     * @param unitsProcessed cumulative traversal units processed so far
     * @param bpProcessed    cumulative base pairs processed so far
     */
    public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) {
        this.elapsedSeconds = elapsedSeconds;
        this.loc = loc;
        this.unitsProcessed = unitsProcessed;
        this.bpProcessed = bpProcessed;
    }

    /** How long in seconds to process 1M traversal units? (Math.max guards against division by zero) */
    private double secondsPerMillionElements() {
        return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1);
    }

    /** How long in seconds to process 1M bp on the genome? (Math.max guards against division by zero) */
    private double secondsPerMillionBP() {
        return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1);
    }

    /** What fraction of the target intervals have we covered? */
    private double calculateFractionGenomeTargetCompleted(final long targetSize) {
        return (1.0*bpProcessed) / targetSize;
    }
}
}

View File

@ -104,7 +104,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
prevLoc = location;
printProgress(dataProvider.getShard(), locus.getLocation());
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus.getLocation());
}
// Take the individual isActive calls and integrate them into contiguous active regions and
@ -185,7 +186,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
*/
private void writeActiveRegionsToStream( final ActiveRegionWalker<M,T> walker ) {
// Just want to output the active regions to a file, not actually process them
for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion : workQueue ) {
for( final ActiveRegion activeRegion : workQueue ) {
if( activeRegion.isActive ) {
walker.activeRegionOutStream.println( activeRegion.getLocation() );
}
@ -198,7 +199,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
while( workQueue.peek() != null ) {
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion = workQueue.remove();
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
} else {
break;
@ -208,15 +209,15 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
return sum;
}
private T processActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final LinkedHashSet<GATKSAMRecord> reads, final Queue<org.broadinstitute.sting.utils.activeregion.ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet<GATKSAMRecord> reads, final Queue<ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
for( final GATKSAMRecord read : reads ) {
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
// The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
org.broadinstitute.sting.utils.activeregion.ActiveRegion bestRegion = activeRegion;
for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion otherRegionToTest : workQueue ) {
ActiveRegion bestRegion = activeRegion;
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
bestRegion = otherRegionToTest;
@ -229,7 +230,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
if( !bestRegion.equals(activeRegion) ) {
activeRegion.add( read );
}
for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion otherRegionToTest : workQueue ) {
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
otherRegionToTest.add( read );
}
@ -241,6 +242,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
}
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
// WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way.
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map( activeRegion, null );

View File

@ -196,7 +196,8 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
sum = walker.reduce(x, sum);
}
printProgress(dataProvider.getShard(),site);
updateCumulativeMetrics(dataProvider.getShard());
printProgress(site);
done = walker.isDone();
}

View File

@ -3,9 +3,7 @@ package org.broadinstitute.sting.gatk.traversals;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
@ -15,28 +13,42 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
/**
* A simple solution to iterating over all reference positions over a series of genomic locations.
*/
public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
public abstract class TraverseLociBase<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
/**
* our log, which we want to capture anything from this class
*/
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
@Override
protected String getTraversalType() {
protected final String getTraversalType() {
return "sites";
}
protected static class TraverseResults<T> {
final int numIterations;
final T reduceResult;
public TraverseResults(int numIterations, T reduceResult) {
this.numIterations = numIterations;
this.reduceResult = reduceResult;
}
}
protected abstract TraverseResults<T> traverse( final LocusWalker<M,T> walker,
final LocusView locusView,
final LocusReferenceView referenceView,
final ReferenceOrderedView referenceOrderedDataView,
final T sum);
@Override
public T traverse( LocusWalker<M,T> walker,
LocusShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider));
logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider));
LocusView locusView = getLocusView( walker, dataProvider );
boolean done = false;
final LocusView locusView = getLocusView( walker, dataProvider );
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null;
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
@ -44,43 +56,24 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
else
referenceOrderedDataView = (RodLocusView)locusView;
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
// We keep processing while the next reference location is within the interval
while( locusView.hasNext() && ! done ) {
AlignmentContext locus = locusView.next();
GenomeLoc location = locus.getLocation();
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
final boolean keepMeP = walker.filter(tracker, refContext, locus);
if (keepMeP) {
M x = walker.map(tracker, refContext, locus);
sum = walker.reduce(x, sum);
done = walker.isDone();
}
printProgress(dataProvider.getShard(),locus.getLocation());
}
final TraverseResults<T> result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum );
sum = result.reduceResult;
dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations);
updateCumulativeMetrics(dataProvider.getShard());
}
// We have a final map call to execute here to clean up the skipped based from the
// last position in the ROD to that in the interval
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
// only do this if the walker isn't done!
RodLocusView rodLocusView = (RodLocusView)locusView;
long nSkipped = rodLocusView.getLastSkippedBases();
final RodLocusView rodLocusView = (RodLocusView)locusView;
final long nSkipped = rodLocusView.getLastSkippedBases();
if ( nSkipped > 0 ) {
GenomeLoc site = rodLocusView.getLocOneBeyondShard();
AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
M x = walker.map(null, null, ac);
final GenomeLoc site = rodLocusView.getLocOneBeyondShard();
final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
final M x = walker.map(null, null, ac);
sum = walker.reduce(x, sum);
}
}
@ -90,14 +83,14 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
/**
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
* of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype
* that comes along.
* @param walker walker to interrogate.
* @param dataProvider Data with which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )

View File

@ -0,0 +1,47 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
 * Single-threaded implementation of the locus traversal: visits every reference
 * position supplied by the LocusView in order, applying the walker's
 * filter/map/reduce at each site and reporting progress as it goes.
 */
public class TraverseLociLinear<M,T> extends TraverseLociBase<M,T> {

    @Override
    protected TraverseResults<T> traverse(LocusWalker<M, T> walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) {
        int sitesVisited = 0;
        boolean walkerFinished = false;

        // Advance one site at a time until the view is exhausted or the walker says it's done
        while( locusView.hasNext() && ! walkerFinished ) {
            sitesVisited++;
            final AlignmentContext pileup = locusView.next();
            final GenomeLoc where = pileup.getLocation();

            // Reference bases at this site.  With a pileup of "extended events" this holds
            // the (longest) stretch of deleted reference bases, if deletions are present.
            final ReferenceContext ref = referenceView.getReferenceContext(where);

            // Pull forward all reference-ordered data overlapping this site
            final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(pileup.getLocation(), ref);

            if ( walker.filter(tracker, ref, pileup) ) {
                sum = walker.reduce(walker.map(tracker, ref, pileup), sum);
                walkerFinished = walker.isDone();
            }

            // progress is reported for every visited site, filtered or not
            printProgress(pileup.getLocation());
        }

        return new TraverseResults<T>(sitesVisited, sum);
    }
}

View File

@ -0,0 +1,205 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import java.util.Iterator;
/**
 * A multi-threaded ("nano-scheduled") implementation of the locus traversal.
 *
 * Sites are pulled from the LocusView one at a time, bundled with their reference
 * context and reference-ordered data into MapData inputs, and handed to a
 * NanoScheduler that fans the walker's map calls out across nThreads workers and
 * folds the results back together with the walker's reduce.
 */
public class TraverseLociNano<M,T> extends TraverseLociBase<M,T> {
    /** When true, the underlying NanoScheduler emits verbose debugging output */
    private static final boolean DEBUG = false;

    // number of MapData inputs buffered between the producer and the map workers
    private static final int BUFFER_SIZE = 1000;

    final NanoScheduler<MapData, MapResult, T> nanoScheduler;

    /**
     * Create a nano-scheduled locus traversal running map calls on nThreads threads.
     *
     * @param nThreads number of worker threads for the NanoScheduler
     */
    public TraverseLociNano(int nThreads) {
        nanoScheduler = new NanoScheduler<MapData, MapResult, T>(BUFFER_SIZE, nThreads);
        // route scheduler progress callbacks into the standard TraversalEngine progress meter
        nanoScheduler.setProgressFunction(new TraverseLociProgress());
    }

    @Override
    protected TraverseResults<T> traverse(final LocusWalker<M, T> walker,
                                          final LocusView locusView,
                                          final LocusReferenceView referenceView,
                                          final ReferenceOrderedView referenceOrderedDataView,
                                          final T sum) {
        nanoScheduler.setDebug(DEBUG);
        final TraverseLociMap myMap = new TraverseLociMap(walker);
        final TraverseLociReduce myReduce = new TraverseLociReduce(walker);

        // the iterator produces one MapData per locus; execute() runs map on the workers
        // and reduces the results, returning the final reduce value
        final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView);
        final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce);

        return new TraverseResults<T>(inputIterator.numIterations, result);
    }

    /**
     * Create iterator that provides inputs for all map calls into MapData, to be provided
     * to NanoScheduler for Map/Reduce
     */
    private class MapDataIterator implements Iterator<MapData> {
        final LocusView locusView;
        final LocusReferenceView referenceView;
        final ReferenceOrderedView referenceOrderedDataView;
        int numIterations = 0;  // total number of sites pulled through this iterator

        private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) {
            this.locusView = locusView;
            this.referenceView = referenceView;
            this.referenceOrderedDataView = referenceOrderedDataView;
        }

        @Override
        public boolean hasNext() {
            return locusView.hasNext();
        }

        /**
         * Bundle the next locus with its reference context and overlapping
         * reference-ordered data into a single MapData input.
         */
        @Override
        public MapData next() {
            final AlignmentContext locus = locusView.next();
            final GenomeLoc location = locus.getLocation();

            //logger.info("Pulling data from MapDataIterator at " + location);

            // create reference context. Note that if we have a pileup of "extended events", the context will
            // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
            final ReferenceContext refContext = referenceView.getReferenceContext(location);

            // Iterate forward to get all reference ordered data covering this location
            final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext);

            numIterations++;
            return new MapData(locus, refContext, tracker);
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator");
        }
    }

    /**
     * Shut down the NanoScheduler's worker threads before printing the final
     * traversal summary.
     */
    @Override
    public void printOnTraversalDone() {
        nanoScheduler.shutdown();
        super.printOnTraversalDone();
    }

    /**
     * The input data needed for each map call. The read, the reference, and the RODs
     */
    private class MapData {
        final AlignmentContext alignmentContext;
        final ReferenceContext refContext;
        final RefMetaDataTracker tracker;

        private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) {
            this.alignmentContext = alignmentContext;
            this.refContext = refContext;
            this.tracker = tracker;
        }

        @Override
        public String toString() {
            return "MapData " + alignmentContext.getLocation();
        }
    }

    /**
     * Contains the results of a map call, indicating whether the call was good, filtered, or done
     */
    private class MapResult {
        final M value;        // the walker's map result; null when reduceMe is false
        final boolean reduceMe;  // should this result be passed to reduce?

        /**
         * Create a MapResult with value that should be reduced
         *
         * @param value the value to reduce
         */
        private MapResult(final M value) {
            this.value = value;
            this.reduceMe = true;
        }

        /**
         * Create a MapResult that shouldn't be reduced
         */
        private MapResult() {
            this.value = null;
            this.reduceMe = false;
        }
    }

    /**
     * A static object that tells reduce that the result of map should be skipped (filtered or done)
     */
    private final MapResult SKIP_REDUCE = new MapResult();

    /**
     * MapFunction for TraverseLoci meeting NanoScheduler interface requirements
     *
     * Applies walker.map to MapData, returning a MapResult object containing the result
     */
    private class TraverseLociMap implements NSMapFunction<MapData, MapResult> {
        final LocusWalker<M,T> walker;

        private TraverseLociMap(LocusWalker<M, T> walker) {
            this.walker = walker;
        }

        @Override
        public MapResult apply(final MapData data) {
            if ( ! walker.isDone() ) {
                final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext);
                if (keepMeP) {
                    final M x = walker.map(data.tracker, data.refContext, data.alignmentContext);
                    return new MapResult(x);
                }
            }

            // walker is done or the site was filtered out -- don't reduce this result
            return SKIP_REDUCE;
        }
    }

    /**
     * NSReduceFunction for TraverseLoci meeting NanoScheduler interface requirements
     *
     * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable
     */
    private class TraverseLociReduce implements NSReduceFunction<MapResult, T> {
        final LocusWalker<M,T> walker;

        private TraverseLociReduce(LocusWalker<M, T> walker) {
            this.walker = walker;
        }

        @Override
        public T apply(MapResult one, T sum) {
            if ( one.reduceMe )
                // only run reduce on values that aren't DONE or FAILED
                return walker.reduce(one.value, sum);
            else
                return sum;
        }
    }

    /** Progress callback invoked by the NanoScheduler; forwards to the engine's progress meter */
    private class TraverseLociProgress implements NSProgressFunction<MapData> {
        @Override
        public void progress(MapData lastProcessedMap) {
            if (lastProcessedMap.alignmentContext != null)
                printProgress(lastProcessedMap.alignmentContext.getLocation());
        }
    }
}

View File

@ -65,7 +65,8 @@ public class TraverseReadPairs<M,T> extends TraversalEngine<M,T, ReadPairWalker<
pairs.clear();
pairs.add(read);
printProgress(dataProvider.getShard(),null);
updateCumulativeMetrics(dataProvider.getShard());
printProgress(null);
}
done = walker.isDone();

View File

@ -1,20 +1,3 @@
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -39,6 +22,19 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* @author aaron
@ -75,29 +71,27 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
if( !dataProvider.hasReads() )
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
ReadView reads = new ReadView(dataProvider);
ReadReferenceView reference = new ReadReferenceView(dataProvider);
final ReadView reads = new ReadView(dataProvider);
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
// get the reference ordered data
ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
boolean done = walker.isDone();
// while we still have more reads
for (SAMRecord read : reads) {
for (final SAMRecord read : reads) {
if ( done ) break;
// ReferenceContext -- the reference bases covered by the read
ReferenceContext refContext = null;
// get the array of characters for the reference sequence, since we're a mapped read
if (!read.getReadUnmappedFlag() && dataProvider.hasReference())
refContext = reference.getReferenceContext(read);
// ReferenceContext -- the reference bases covered by the read
final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference()
? reference.getReferenceContext(read)
: null;
// update the number of reads we've seen
ReadMetrics readMetrics = dataProvider.getShard().getReadMetrics();
readMetrics.incrementNumIterations();
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// if the read is mapped, create a metadata tracker
ReadMetaDataTracker tracker = (read.getReferenceIndex() >= 0) ? rodView.getReferenceOrderedDataForRead(read) : null;
final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null;
final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read);
if (keepMeP) {
@ -105,8 +99,11 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
sum = walker.reduce(x, sum);
}
GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
printProgress(dataProvider.getShard(),locus);
final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus);
done = walker.isDone();
}
return sum;

View File

@ -0,0 +1,234 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.LinkedList;
import java.util.List;
/**
* A nano-scheduling version of TraverseReads.
*
* Implements the traversal of a walker that accepts individual reads, the reference, and
* RODs per map call. Directly supports shared memory parallelism via NanoScheduler
*
* @author depristo
* @version 1.0
* @date 9/2/2012
*/
public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,ReadShardDataProvider> {
    /** our log, which we want to capture anything from this class */
    protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class);

    /** When true, the NanoScheduler emits verbose debugging output */
    private static final boolean DEBUG = false;

    /** Shared-memory map/reduce scheduler that distributes map calls across threads */
    final NanoScheduler<MapData, MapResult, T> nanoScheduler;

    /**
     * Create a new TraverseReadsNano whose map calls may run on up to nThreads threads
     *
     * @param nThreads the number of threads the NanoScheduler should use
     */
    public TraverseReadsNano(int nThreads) {
        final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max
        nanoScheduler = new NanoScheduler<MapData, MapResult, T>(bufferSize, nThreads);
    }

    @Override
    protected String getTraversalType() {
        return "reads";
    }

    /**
     * Traverse by reads, given the data and the walker
     *
     * @param walker the walker to traverse with
     * @param dataProvider the provider of the reads data
     * @param sum the value of type T, specified by the walker, to feed to the walker's reduce function
     * @return the reduce variable of the read walker
     * @throws IllegalArgumentException if dataProvider has no read data
     */
    public T traverse(ReadWalker<M,T> walker,
                      ReadShardDataProvider dataProvider,
                      T sum) {
        logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider));

        if( !dataProvider.hasReads() )
            throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");

        nanoScheduler.setDebug(DEBUG);
        final TraverseReadsMap myMap = new TraverseReadsMap(walker);
        final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker);

        final List<MapData> aggregatedInputs = aggregateMapData(dataProvider);
        final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce);

        updateCumulativeMetrics(dataProvider.getShard());

        // Guard against a shard that yielded no reads: there is no "last read" to report
        // progress on, and get(size() - 1) would throw IndexOutOfBoundsException
        if ( ! aggregatedInputs.isEmpty() ) {
            final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read;
            // unmapped reads have no alignment position, so report a null locus for them,
            // exactly as TraverseReads does
            final GenomeLoc locus = lastRead.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
                    ? null
                    : engine.getGenomeLocParser().createGenomeLoc(lastRead);
            printProgress(locus);
        }

        return result;
    }

    /**
     * Aggregate all of the inputs for all map calls into MapData, to be provided
     * to NanoScheduler for Map/Reduce
     *
     * @param dataProvider the source of our data
     * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce
     *         should execute
     */
    private List<MapData> aggregateMapData(final ReadShardDataProvider dataProvider) {
        final ReadView reads = new ReadView(dataProvider);
        final ReadReferenceView reference = new ReadReferenceView(dataProvider);
        final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);

        final List<MapData> mapData = new LinkedList<MapData>();
        for ( final SAMRecord read : reads ) {
            // only fetch reference bases for mapped reads, and only when a reference
            // is actually available (consistent with the single-threaded TraverseReads)
            final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference()
                    ? reference.getReferenceContext(read)
                    : null;

            // if the read is mapped, create a metadata tracker
            final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0
                    ? rodView.getReferenceOrderedDataForRead(read)
                    : null;

            // update the number of reads we've seen
            dataProvider.getShard().getReadMetrics().incrementNumIterations();

            mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker));
        }

        return mapData;
    }

    @Override
    public void printOnTraversalDone() {
        // shut down the scheduler's worker threads before the standard cleanup
        nanoScheduler.shutdown();
        super.printOnTraversalDone();
    }

    /**
     * The input data needed for each map call.  The read, the reference, and the RODs
     */
    private class MapData {
        final GATKSAMRecord read;
        final ReferenceContext refContext;  // may be null for unmapped reads or when no reference is loaded
        final RefMetaDataTracker tracker;   // may be null for unaligned reads

        private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) {
            this.read = read;
            this.refContext = refContext;
            this.tracker = tracker;
        }
    }

    /**
     * Contains the results of a map call, indicating whether the call was good, filtered, or done
     */
    private class MapResult {
        final M value;
        final boolean reduceMe;

        /**
         * Create a MapResult with value that should be reduced
         *
         * @param value the value to reduce
         */
        private MapResult(final M value) {
            this.value = value;
            this.reduceMe = true;
        }

        /**
         * Create a MapResult that shouldn't be reduced
         */
        private MapResult() {
            this.value = null;
            this.reduceMe = false;
        }
    }

    /**
     * A static object that tells reduce that the result of map should be skipped (filtered or done)
     */
    private final MapResult SKIP_REDUCE = new MapResult();

    /**
     * MapFunction for TraverseReads meeting NanoScheduler interface requirements
     *
     * Applies walker.map to MapData, returning a MapResult object containing the result
     */
    private class TraverseReadsMap implements NSMapFunction<MapData, MapResult> {
        final ReadWalker<M,T> walker;

        private TraverseReadsMap(ReadWalker<M, T> walker) {
            this.walker = walker;
        }

        @Override
        public MapResult apply(final MapData data) {
            if ( ! walker.isDone() ) {
                // apply the walker's filter, and only map reads that pass it
                final boolean keepMeP = walker.filter(data.refContext, data.read);
                if (keepMeP)
                    return new MapResult(walker.map(data.refContext, data.read, data.tracker));
            }
            return SKIP_REDUCE;
        }
    }

    /**
     * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements
     *
     * Takes a MapResult object and applies the walker's reduce function to each map result, when applicable
     */
    private class TraverseReadsReduce implements NSReduceFunction<MapResult, T> {
        final ReadWalker<M,T> walker;

        private TraverseReadsReduce(ReadWalker<M, T> walker) {
            this.walker = walker;
        }

        @Override
        public T apply(MapResult one, T sum) {
            if ( one.reduceMe )
                // only run reduce on values that aren't DONE or FAILED
                return walker.reduce(one.value, sum);
            else
                return sum;
        }
    }
}

View File

@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
@ -77,7 +78,7 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
public abstract ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
// Map over the ActiveRegion
public abstract MapType map(final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import java.lang.annotation.*;
/**
@ -25,5 +27,5 @@ import java.lang.annotation.*;
@Target(ElementType.TYPE)
public @interface BAQMode {
public abstract org.broadinstitute.sting.utils.baq.BAQ.QualityMode QualityMode() default org.broadinstitute.sting.utils.baq.BAQ.QualityMode.OVERWRITE_QUALS;
public abstract org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime ApplicationTime() default org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime.ON_INPUT;
public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT;
}

View File

@ -36,7 +36,7 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ClippingOp;
@ -299,7 +299,7 @@ public class ClipReads extends ReadWalker<ClipReads.ReadClipperWithData, ClipRea
* @param read the read itself, as a GATKSAMRecord
* @return the ReadClipper object describing what should be done to clip this read
*/
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES )
read = ReadClipper.revertSoftClippedBases(read);

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import java.lang.annotation.*;

View File

@ -3,7 +3,7 @@ package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -45,12 +45,12 @@ import java.text.NumberFormat;
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
@Requires({DataSource.READS})
public class FlagStat extends ReadWalker<Integer, Integer> {
public class FlagStat extends ReadWalker<FlagStat.FlagStatus, FlagStat.FlagStatus> implements NanoSchedulable {
@Output
PrintStream out;
// what comes out of the flagstat
static class FlagStatus {
public final static class FlagStatus {
long readCount = 0L;
long QC_failure = 0L;
long duplicates = 0L;
@ -117,62 +117,84 @@ public class FlagStat extends ReadWalker<Integer, Integer> {
return builder.toString();
}
}
public FlagStatus add(final FlagStatus other) {
readCount += other.readCount;
QC_failure += other.QC_failure;
duplicates += other.duplicates;
mapped += other.mapped;
paired_in_sequencing += other.paired_in_sequencing;
read1 += other.read1;
read2 += other.read2;
properly_paired += other.properly_paired;
with_itself_and_mate_mapped += other.with_itself_and_mate_mapped;
singletons += other.singletons;
with_mate_mapped_to_a_different_chr += other.with_mate_mapped_to_a_different_chr;
with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5 += other.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5;
private FlagStatus myStat = new FlagStatus();
public Integer map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
myStat.readCount++;
if (read.getReadFailsVendorQualityCheckFlag()) {
myStat.QC_failure++;
return this;
}
if (read.getDuplicateReadFlag()) {
myStat.duplicates++;
}
if (!read.getReadUnmappedFlag()) {
myStat.mapped++;
}
if (read.getReadPairedFlag()) {
myStat.paired_in_sequencing++;
if (read.getSecondOfPairFlag()) {
myStat.read2++;
} else if (read.getReadPairedFlag()) {
myStat.read1++;
public FlagStatus add(final GATKSAMRecord read) {
this.readCount++;
if (read.getReadFailsVendorQualityCheckFlag()) {
this.QC_failure++;
}
if (read.getProperPairFlag()) {
myStat.properly_paired++;
if (read.getDuplicateReadFlag()) {
this.duplicates++;
}
if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) {
myStat.with_itself_and_mate_mapped++;
if (!read.getReadUnmappedFlag()) {
this.mapped++;
}
if (read.getReadPairedFlag()) {
this.paired_in_sequencing++;
if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) {
myStat.with_mate_mapped_to_a_different_chr++;
if (read.getSecondOfPairFlag()) {
this.read2++;
} else if (read.getReadPairedFlag()) {
this.read1++;
}
if (read.getProperPairFlag()) {
this.properly_paired++;
}
if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) {
this.with_itself_and_mate_mapped++;
if (read.getMappingQuality() >= 5) {
myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++;
if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) {
this.with_mate_mapped_to_a_different_chr++;
if (read.getMappingQuality() >= 5) {
this.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++;
}
}
}
if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) {
this.singletons++;
}
}
if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) {
myStat.singletons++;
}
return this;
}
return 1;
}
public Integer reduceInit() {
return 0;
@Override
public FlagStatus map( final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
return new FlagStatus().add(read);
}
@Override
public FlagStatus reduceInit() {
return new FlagStatus();
}
public Integer reduce(Integer value, Integer sum) {
return value + sum;
@Override
public FlagStatus reduce(final FlagStatus value, final FlagStatus sum) {
return sum.add(value);
}
public void onTraversalDone(Integer result) {
//out.println("[REDUCE RESULT] Traversal result is: " + result);
out.println(myStat.toString());
@Override
public void onTraversalDone(final FlagStatus result) {
out.println(result.toString());
}
}

View File

@ -0,0 +1,31 @@
/*
* Copyright (c) 2010. The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers;
/**
 * Root parallelism interface. Walkers that implement this
 * declare that their map function is thread-safe and so multiple
 * map calls can be run in parallel in the same JVM instance.
 *
 * Note: this is a marker interface -- it declares no methods.  Implementing it
 * is purely a statement by the walker author that concurrent invocations of
 * map are safe, which the engine may use to enable shared-memory parallelism.
 */
public interface NanoSchedulable {
}

View File

@ -45,25 +45,14 @@ import java.util.Collections;
import java.util.List;
/**
* Prints the alignment in the pileup format. In the pileup format, each line represents a genomic position,
* consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping
* qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all
* encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand,
* a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the
* reverse strand.
*
* A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next
* reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence.
* Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference.
* Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read
* separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality.
* A symbol '$' marks the end of a read segment.
* Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position,
* consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
*
* Associated command:
* samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] <in.alignment>
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
public class Pileup extends LocusWalker<Integer, Integer> implements TreeReducible<Integer> {
public class Pileup extends LocusWalker<String, Integer> implements TreeReducible<Integer>, NanoSchedulable {
private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names
@ -81,27 +70,32 @@ public class Pileup extends LocusWalker<Integer, Integer> implements TreeReducib
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
public void initialize() {
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
String rods = getReferenceOrderedData( tracker );
@Override
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
final String rods = getReferenceOrderedData( tracker );
ReadBackedPileup basePileup = context.getBasePileup();
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
if ( SHOW_VERBOSE )
out.printf(" %s", createVerboseOutput(basePileup));
out.println();
return 1;
final StringBuilder s = new StringBuilder();
s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods));
if ( SHOW_VERBOSE )
s.append(" ").append(createVerboseOutput(basePileup));
s.append("\n");
return s.toString();
}
// Given result of map function
@Override
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return treeReduce(sum,value);
@Override
public Integer reduce(String value, Integer sum) {
out.print(value);
return sum + 1;
}
@Override
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}

View File

@ -32,17 +32,16 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.File;
import java.util.Collection;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import java.util.*;
/**
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
@ -91,9 +90,10 @@ import java.util.TreeSet;
*
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable {
@Output(doc="Write output to this BAM filename instead of STDOUT", required = true)
SAMFileWriter out;
@ -138,6 +138,7 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
public boolean simplifyReads = false;
List<ReadTransformer> readTransformers = Collections.emptyList();
private TreeSet<String> samplesToChoose = new TreeSet<String>();
private boolean SAMPLES_SPECIFIED = false;
@ -150,6 +151,9 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
if ( platform != null )
platform = platform.toUpperCase();
if ( getToolkit() != null )
readTransformers = getToolkit().getReadTransformers();
Collection<String> samplesFromFile;
if (!sampleFile.isEmpty()) {
samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile);
@ -217,11 +221,19 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
* The reads map function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a GATKSAMRecord
* @param readIn the read itself, as a GATKSAMRecord
* @return the read itself
*/
public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
return simplifyReads ? read.simplify() : read;
public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
GATKSAMRecord workingRead = readIn;
for ( final ReadTransformer transformer : readTransformers ) {
workingRead = transformer.apply(workingRead);
}
if ( simplifyReads ) workingRead = workingRead.simplify();
return workingRead;
}
/**
@ -245,5 +257,4 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
output.addAlignment(read);
return output;
}
}

View File

@ -1,8 +1,7 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
@ -27,5 +26,5 @@ public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, Re
}
// Map over the org.broadinstitute.sting.gatk.contexts.AlignmentContext
public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker);
public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker);
}

View File

@ -33,7 +33,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -64,7 +64,7 @@ public class SplitSamFile extends ReadWalker<SAMRecord, Map<String, SAMFileWrite
logger.info("SplitSamFile version: " + VERSION);
}
public SAMRecord map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
public SAMRecord map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
return read;
}

View File

@ -30,12 +30,14 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.filters.MalformedReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.recalibration.BQSRMode;
import java.util.List;
@ -48,7 +50,8 @@ import java.util.List;
*/
@ReadFilters(MalformedReadFilter.class)
@PartitionBy(PartitionType.NONE)
@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT)
@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT)
@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class})
public abstract class Walker<MapType, ReduceType> {
final protected static Logger logger = Logger.getLogger(Walker.class);

View File

@ -33,6 +33,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
if ( stratifiedContext == null )
return;
Double ratio = annotateSNP(stratifiedContext, vc, g);
if (ratio == null)
return;

View File

@ -16,7 +16,7 @@ import java.util.*;
/**
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
* Note that the base quality rank sum test can not be calculated for homozygous sites.
* Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
*/
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }

View File

@ -16,6 +16,10 @@ import java.util.*;
* Date: 6/28/12
*/
/**
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele)
* Note that the clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
*/
public class ClippingRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("ClippingRankSum"); }

View File

@ -54,7 +54,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( g == null || !g.isCalled() )
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
return;
if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())
@ -97,7 +97,6 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
if (!vc.getAlleles().contains(a))
continue; // sanity check - shouldn't be needed
alleleCounts.put(a,alleleCounts.get(a)+1);
}
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference());

View File

@ -32,13 +32,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -55,6 +53,8 @@ import java.util.*;
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private static final String FS = "FS";
private static final double MIN_PVALUE = 1E-320;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
final ReferenceContext ref,
@ -64,30 +64,53 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
if ( !vc.isVariant() )
return null;
int[][] table;
if (vc.isSNP() && stratifiedContexts != null) {
table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1);
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST);
return pValueForBestTable(tableFiltering, tableNoFiltering);
}
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
// either SNP with no alignment context, or indels: per-read likelihood map needed
table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
return pValueForBestTable(table, null);
}
else
// for non-snp variants, we need per-read likelihoods.
// for snps, we can get same result from simple pileup
// for non-snp variants, we need per-read likelihoods.
// for snps, we can get same result from simple pileup
return null;
}
if (table == null)
return null;
/**
* Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
*
* @param table1 a contingency table, may be null
* @param table2 a contingency table, may be null
* @return annotation result for FS given tables
*/
private Map<String, Object> pValueForBestTable(final int[][] table1, final int[][] table2) {
if ( table2 == null )
return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1));
else if (table1 == null)
return annotationForOneTable(pValueForContingencyTable(table2));
else { // take the one with the best (i.e., least significant pvalue)
double pvalue1 = Math.max(pValueForContingencyTable(table1), MIN_PVALUE);
double pvalue2 = Math.max(pValueForContingencyTable(table2), MIN_PVALUE);
return annotationForOneTable(Math.max(pvalue1, pvalue2));
}
}
Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE);
if ( pvalue == null )
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
return map;
/**
* Returns an annotation result given a pValue
*
* @param pValue
* @return a hash map from FS -> phred-scaled pValue
*/
private Map<String, Object> annotationForOneTable(final double pValue) {
final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue));
return Collections.singletonMap(FS, value);
// Map<String, Object> map = new HashMap<String, Object>();
// map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)));
// return map;
}
public List<String> getKeyNames() {
@ -244,7 +267,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
* allele2 # #
* @return a 2x2 contingency table
*/
private static int[][] getSNPContingencyTable(Map<String, AlignmentContext> stratifiedContexts, Allele ref, Allele alt) {
private static int[][] getSNPContingencyTable(final Map<String, AlignmentContext> stratifiedContexts,
final Allele ref,
final Allele alt,
final int minQScoreToConsider ) {
int[][] table = new int[2][2];
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
@ -252,8 +278,11 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
continue;
Allele base = Allele.create(p.getBase(), false);
boolean isFW = !p.getRead().getReadNegativeStrandFlag();
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
continue;
final Allele base = Allele.create(p.getBase(), false);
final boolean isFW = !p.getRead().getReadNegativeStrandFlag();
final boolean matchesRef = ref.equals(base, true);
final boolean matchesAlt = alt.equals(base, true);
@ -268,6 +297,4 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
return table;
}
}

View File

@ -3,14 +3,12 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.Trio;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
@ -21,21 +19,17 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 9/14/11
* Time: 12:24 PM
* Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation
* versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is
* diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than
* the strict 1-(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.
*/
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
private MendelianViolation mendelianViolation = null;
public static final String MVLR_KEY = "MVLR";
private Set<Trio> trios;
private class Trio {
String motherId;
String fatherId;
String childId;
}
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -44,7 +38,8 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( mendelianViolation == null ) {
if (checkAndSetSamples(((Walker) walker).getSampleDB())) {
trios = ((Walker) walker).getSampleDB().getTrios();
if ( trios.size() > 0 ) {
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
else {
@ -52,15 +47,12 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
}
}
Map<String,Object> toRet = new HashMap<String,Object>(1);
Map<String,Object> attributeMap = new HashMap<String,Object>(1);
//double pNoMV = 1.0;
double maxMVLR = Double.MIN_VALUE;
for ( Trio trio : trios ) {
boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() &&
vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() &&
vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods();
if ( hasAppropriateGenotypes ) {
Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId);
if ( contextHasTrioLikelihoods(vc,trio) ) {
Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.getChildID());
maxMVLR = likR > maxMVLR ? likR : maxMVLR;
//pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR)));
}
@ -68,34 +60,26 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
//double pSomeMV = 1.0-pNoMV;
//toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV));
toRet.put("MVLR",maxMVLR);
return toRet;
if ( Double.compare(maxMVLR,Double.MIN_VALUE) != 0 )
attributeMap.put(MVLR_KEY,maxMVLR);
return attributeMap;
}
// return the descriptions used for the VCF INFO meta field
public List<String> getKeyNames() { return Arrays.asList("MVLR"); }
public List<String> getKeyNames() { return Arrays.asList(MVLR_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
private boolean checkAndSetSamples(SampleDB db){
trios = new HashSet<Trio>();
Set<String> families = db.getFamilyIDs();
for ( String familyString : families ) {
Set<Sample> family = db.getFamily(familyString);
Iterator<Sample> sampleIterator = family.iterator();
Sample sample;
for ( sample = sampleIterator.next(); sampleIterator.hasNext(); sample=sampleIterator.next()) {
if ( sample.getParents().size() == 2 ) {
Trio trio = new Trio();
trio.childId = sample.getID();
trio.fatherId = sample.getFather().getID();
trio.motherId = sample.getMother().getID();
trios.add(trio);
}
}
private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) {
for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) {
if ( ! context.hasGenotype(sample) )
return false;
if ( ! context.getGenotype(sample).hasLikelihoods() )
return false;
}
return trios.size() > 0;
return true;
}
}

View File

@ -17,7 +17,7 @@ import java.util.*;
/**
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
* Note that the mapping quality rank sum test can not be calculated for homozygous sites.
* Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
*/
public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation {

View File

@ -55,7 +55,7 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation {
final Genotype g,
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
if ( g == null || !g.isCalled() )
if ( g == null || !g.isCalled() || stratifiedContext == null )
return;
int mq0 = 0;

View File

@ -20,7 +20,7 @@ import java.util.*;
/**
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
* Note that the read position rank sum test can not be calculated for homozygous sites.
* Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
*/
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {

View File

@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
@ -218,7 +217,10 @@ public class VariantAnnotatorEngine {
// go through all the requested info annotationTypes
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
Map<String, Object> annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(perReadAlleleLikelihoodMap, vc);
if ( !(annotationType instanceof ActiveRegionBasedAnnotation) )
continue;
Map<String, Object> annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc);
if ( annotationsFromCurrentType != null ) {
infoAnnotations.putAll(annotationsFromCurrentType);
}
@ -298,16 +300,12 @@ public class VariantAnnotatorEngine {
if (stratifiedPerReadAlleleLikelihoodMap != null)
perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName());
if ( context == null && perReadAlleleLikelihoodMap == null) {
// no likelihoods nor pileup available: just move on to next sample
genotypes.add(genotype);
} else {
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap);
}
genotypes.add(gb.make());
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap);
}
genotypes.add(gb.make());
}
return genotypes;

View File

@ -32,10 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -46,6 +45,7 @@ import org.broadinstitute.sting.utils.recalibration.QuantizationInfo;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -56,7 +56,7 @@ import java.lang.reflect.Constructor;
import java.util.ArrayList;
/**
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
*
* <p>
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
@ -104,28 +104,28 @@ import java.util.ArrayList;
*/
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@By(DataSource.READS)
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeReducible<Long> {
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeReducible<Long>, NanoSchedulable {
@ArgumentCollection
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates
private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization
private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization
private RecalibrationTables recalibrationTables;
private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental)
private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental)
private RecalibrationEngine recalibrationEngine;
private int minimumQToUse;
protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
@ -143,16 +143,16 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
if (RAC.FORCE_PLATFORM != null)
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION);
if (RAC.LIST_ONLY) {
RecalUtils.listAvailableCovariates(logger);
System.exit(0);
}
RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table
RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
@ -164,9 +164,9 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
requestedCovariates[covariateIndex++] = covariate;
logger.info("The covariates being used here: ");
for (Covariate cov : requestedCovariates) { // list all the covariates being used
for (Covariate cov : requestedCovariates) { // list all the covariates being used
logger.info("\t" + cov.getClass().getSimpleName());
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
}
int numReadGroups = 0;
@ -216,12 +216,14 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
*/
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
long countedSites = 0L;
if (tracker.getValues(RAC.knownSites).size() == 0) { // Only analyze sites not present in the provided known sites
// Only analyze sites not present in the provided known sites
if (tracker.getValues(RAC.knownSites).size() == 0) {
for (final PileupElement p : context.getBasePileup()) {
final GATKSAMRecord read = p.getRead();
final int offset = p.getOffset();
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset)) // This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset))
continue;
if (readNotSeen(read)) {
@ -234,10 +236,12 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates));
}
if (!ReadUtils.isSOLiDRead(read) || // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if (!ReadUtils.isSOLiDRead(read) ||
RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING ||
RecalUtils.isColorSpaceConsistent(read, offset))
recalibrationEngine.updateDataForPileupElement(p, ref.getBase()); // This base finally passed all the checks for a good base, so add it to the big data hashmap
// This base finally passed all the checks for a good base, so add it to the big data hashmap
recalibrationEngine.updateDataForPileupElement(p, ref.getBase());
}
countedSites++;
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
@ -34,4 +35,5 @@ public interface RecalibrationEngine {
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase);
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors);
}

Some files were not shown because too many files have changed in this diff Show More