Separated out the DoC calculations from the XHMM pipeline, so that CalcDepthOfCoverage can be used for calculating joint coverage on a per-base accounting over multiple samples (e.g., family samples)
This commit is contained in:
commit
449b89bd34
|
|
@ -18,3 +18,10 @@ queueScatterGather
|
|||
/bar*
|
||||
integrationtests/
|
||||
public/testdata/onTheFlyOutputTest.vcf
|
||||
private/testdata/onTheFlyOutputTest.vcf
|
||||
lib
|
||||
html
|
||||
gatkdocs
|
||||
dist
|
||||
build
|
||||
resources
|
||||
|
|
|
|||
|
|
@ -577,6 +577,7 @@
|
|||
docletpathref="doclet.classpath"
|
||||
classpathref="external.dependencies"
|
||||
classpath="${java.classes}"
|
||||
maxmemory="2g"
|
||||
additionalparam="-build-timestamp "${build.timestamp}" -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet">
|
||||
<sourcefiles>
|
||||
<union>
|
||||
|
|
@ -780,6 +781,7 @@
|
|||
docletpathref="doclet.classpath"
|
||||
classpathref="external.dependencies"
|
||||
classpath="${java.classes}"
|
||||
maxmemory="2g"
|
||||
additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet"> <!-- -test to only do DocumentationTest walker -->
|
||||
<sourcefiles>
|
||||
<fileset refid="java.source.files"/>
|
||||
|
|
|
|||
|
|
@ -34,17 +34,20 @@ import org.broadinstitute.sting.utils.recalibration.EventType;
|
|||
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource {
|
||||
|
||||
// optimizations: don't reallocate an array each time
|
||||
private byte[] tempQualArray;
|
||||
private boolean[] tempErrorArray;
|
||||
private double[] tempFractionalErrorArray;
|
||||
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
|
||||
super.initialize(covariates, recalibrationTables);
|
||||
tempQualArray = new byte[EventType.values().length];
|
||||
tempErrorArray = new boolean[EventType.values().length];
|
||||
tempFractionalErrorArray = new double[EventType.values().length];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -56,6 +59,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
* @param pileupElement The pileup element to update
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
@Override
|
||||
public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
|
||||
final int offset = pileupElement.getOffset();
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
|
||||
|
|
@ -76,7 +80,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
|
||||
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
|
||||
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
|
||||
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
|
||||
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
|
||||
rgRecalTable.put(rgThisDatum, keys[0], eventIndex);
|
||||
else
|
||||
rgPreviousDatum.combine(rgThisDatum);
|
||||
|
|
@ -100,4 +104,53 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
|
||||
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
|
||||
if( !skip[offset] ) {
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(read);
|
||||
|
||||
tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset];
|
||||
tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset];
|
||||
tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset];
|
||||
|
||||
for (final EventType eventType : EventType.values()) {
|
||||
final int[] keys = readCovariates.getKeySet(offset, eventType);
|
||||
final int eventIndex = eventType.index;
|
||||
final byte qual = tempQualArray[eventIndex];
|
||||
final double isError = tempFractionalErrorArray[eventIndex];
|
||||
|
||||
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
|
||||
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
|
||||
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
|
||||
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
|
||||
rgRecalTable.put(rgThisDatum, keys[0], eventIndex);
|
||||
else
|
||||
rgPreviousDatum.combine(rgThisDatum);
|
||||
|
||||
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
|
||||
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
|
||||
if (qualPreviousDatum == null)
|
||||
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);
|
||||
else
|
||||
qualPreviousDatum.increment(1.0, isError);
|
||||
|
||||
for (int i = 2; i < covariates.length; i++) {
|
||||
if (keys[i] < 0)
|
||||
continue;
|
||||
final NestedIntegerArray<RecalDatum> covRecalTable = recalibrationTables.getTable(i);
|
||||
final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex);
|
||||
if (covPreviousDatum == null)
|
||||
covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex);
|
||||
else
|
||||
covPreviousDatum.increment(1.0, isError);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionType;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
|
|
@ -247,7 +247,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* @return a linked list with all the reads produced by the clipping operations
|
||||
*/
|
||||
@Override
|
||||
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
LinkedList<GATKSAMRecord> mappedReads;
|
||||
totalReads++;
|
||||
if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
|
||||
|
|
|
|||
|
|
@ -546,7 +546,7 @@ public class SlidingWindow {
|
|||
|
||||
FractionalDownsampler <GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
|
||||
downsampler.submit(allReads);
|
||||
return downsampler.consumeDownsampledItems();
|
||||
return downsampler.consumeFinalizedItems();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -52,7 +52,11 @@ public class GenotypingEngine {
|
|||
noCall.add(Allele.NO_CALL);
|
||||
}
|
||||
|
||||
// This function is the streamlined approach, currently not being used
|
||||
// WARN
|
||||
// This function is the streamlined approach, currently not being used by default
|
||||
// WARN
|
||||
// WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
|
||||
// WARN
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
|
|
@ -184,6 +188,7 @@ public class GenotypingEngine {
|
|||
return returnCalls;
|
||||
}
|
||||
|
||||
// BUGBUG: Create a class to hold this complicated return type
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
|
|
@ -210,13 +215,8 @@ public class GenotypingEngine {
|
|||
System.out.println( ">> Events = " + h.getEventMap());
|
||||
}
|
||||
}
|
||||
// Create the VC merge priority list
|
||||
final ArrayList<String> priorityList = new ArrayList<String>();
|
||||
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
|
||||
priorityList.add("HC" + iii);
|
||||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes, priorityList );
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
|
||||
}
|
||||
|
|
@ -229,13 +229,16 @@ public class GenotypingEngine {
|
|||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
|
||||
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
priorityList.add(vc.getSource());
|
||||
}
|
||||
}
|
||||
} else { // we are in GGA mode!
|
||||
|
|
@ -260,6 +263,22 @@ public class GenotypingEngine {
|
|||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
// Sanity check the priority list
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if( !priorityList.contains(vc.getSource()) ) {
|
||||
throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
for( final String name : priorityList ) {
|
||||
boolean found = false;
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if(vc.getSource().equals(name)) { found = true; break; }
|
||||
}
|
||||
if( !found ) {
|
||||
throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
|
@ -299,9 +318,8 @@ public class GenotypingEngine {
|
|||
return returnCalls;
|
||||
}
|
||||
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes, final ArrayList<String> priorityList ) {
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes ) {
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
final ArrayList<String> stringsToRemove = new ArrayList<String>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().values() ) {
|
||||
if( vc.isSymbolic() ) {
|
||||
|
|
@ -309,7 +327,6 @@ public class GenotypingEngine {
|
|||
for( final VariantContext vc2 : h2.getEventMap().values() ) {
|
||||
if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
|
||||
haplotypesToRemove.add(h);
|
||||
stringsToRemove.add(vc.getSource());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -318,7 +335,6 @@ public class GenotypingEngine {
|
|||
}
|
||||
}
|
||||
haplotypes.removeAll(haplotypesToRemove);
|
||||
priorityList.removeAll(stringsToRemove);
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
|
|
|
|||
|
|
@ -27,26 +27,23 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionType;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -54,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -103,6 +101,7 @@ import java.util.*;
|
|||
|
||||
@DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
|
||||
@ActiveRegionExtension(extension=65, maxRegion=300)
|
||||
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
|
||||
|
||||
|
|
@ -309,7 +308,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
|
||||
if( !allelesToGenotype.contains(vc) ) {
|
||||
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object
|
||||
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
|
||||
}
|
||||
}
|
||||
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
|
||||
|
|
|
|||
|
|
@ -127,9 +127,9 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
@DataProvider(name = "PRTest")
|
||||
public Object[][] createPRTestData() {
|
||||
return new Object[][]{
|
||||
{new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")},
|
||||
{new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")},
|
||||
{new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")},
|
||||
{new PRTest("", "1532242f9fe90ef759a0faa5d85f61fb")},
|
||||
{new PRTest(" -qq -1", "3dd2c87915c96ac55c3872026574d8cb")},
|
||||
{new PRTest(" -qq 6", "5d012ee224f1cb4a7afac59e3655e20c")},
|
||||
{new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,12 +47,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","077db83cf7dc5490f670c85856b408b2");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0ff90fa3882a3fb5089a7bba50dd8ae3");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","e460a17377b731ff4eab36fb56042ecd");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","90af837f372e3d5143af30bf5c8c2b75");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
|
|
@ -67,11 +67,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da359fe7dd6dce045193198c264301ee");
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","c32e10070e10d30d33e5b882c1f89413");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "ad0eef3a9deaa098d79df62af7e5448a");
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "4d16d3c9475637bad70e9dc2eafe2da2");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,4 +66,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e1f88fac91424740c0eaac1de48b3970");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("000fd36d5cf8090386bb2ac15e3ab0b5"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
library(gplots)
|
||||
library(ggplot2)
|
||||
library(tools)
|
||||
|
||||
# -------------------------------------------------------
|
||||
# Utilities for displaying multiple plots per page
|
||||
|
|
@ -59,6 +60,7 @@ closePDF <- function(outputPDF) {
|
|||
if ( ! is.na(outputPDF) ) {
|
||||
dev.off()
|
||||
if (exists("compactPDF")) {
|
||||
print("compacting PDF")
|
||||
compactPDF(outputPDF)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -245,7 +245,7 @@ public class FastaSequenceIndexBuilder {
|
|||
* Reset iterators and add contig to sequence index
|
||||
*/
|
||||
private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
|
||||
sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
|
||||
sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
|
||||
status = Status.NONE;
|
||||
contig = "";
|
||||
size = 0;
|
||||
|
|
@ -258,6 +258,14 @@ public class FastaSequenceIndexBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Trims the contig name to the expected value by removing any characters after the first whitespace
|
||||
*/
|
||||
private static String trimContigName(final String contigName) {
|
||||
int whitespaceIndex = contigName.indexOf(' ');
|
||||
return ( whitespaceIndex == -1 ) ? contigName : contigName.substring(0, whitespaceIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores FastaSequenceIndex as a .fasta.fai file on local machine
|
||||
* Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
|||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -81,7 +81,7 @@ public class AlignmentValidation extends ReadWalker<Integer,Integer> {
|
|||
* @return Number of reads aligned by this map (aka 1).
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
//logger.info(String.format("examining read %s", read.getReadName()));
|
||||
|
||||
byte[] bases = read.getReadBases();
|
||||
|
|
|
|||
|
|
@ -1,139 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
|
||||
* Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@WalkerName("Align")
|
||||
public class AlignmentWalker extends ReadWalker<Integer,Integer> {
|
||||
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
|
||||
"generated by bwa index -d bwtsw. If unspecified, will default " +
|
||||
"to the reference specified via the -R argument.",required=false)
|
||||
private File targetReferenceFile = null;
|
||||
|
||||
@Output
|
||||
private StingSAMFileWriter out = null;
|
||||
|
||||
/**
|
||||
* The actual aligner.
|
||||
*/
|
||||
private BWACAligner aligner = null;
|
||||
|
||||
/**
|
||||
* New header to use, if desired.
|
||||
*/
|
||||
private SAMFileHeader header;
|
||||
|
||||
/**
|
||||
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
if(targetReferenceFile == null)
|
||||
targetReferenceFile = getToolkit().getArguments().referenceFile;
|
||||
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
|
||||
// Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
|
||||
header = getToolkit().getSAMFileHeader().clone();
|
||||
SAMSequenceDictionary referenceDictionary =
|
||||
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
|
||||
header.setSequenceDictionary(referenceDictionary);
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
|
||||
out.writeHeader(header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aligns a read to the given reference.
|
||||
*
|
||||
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||
* @param read Read to align.
|
||||
* @return Number of alignments found for this read.
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
SAMRecord alignedRead = aligner.align(read,header);
|
||||
out.addAlignment(alignedRead);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initial value for reduce. In this case, alignments will be counted.
|
||||
* @return 0, indicating no alignments yet found.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
/**
|
||||
* Calculates the number of alignments found.
|
||||
* @param value Number of alignments found by this map.
|
||||
* @param sum Number of alignments found before this map.
|
||||
* @return Number of alignments found up to and including this map.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup.
|
||||
* @param result Number of reads processed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
aligner.close();
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,132 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
|
||||
* frequency of that number of placements.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
public class CountBestAlignments extends ReadWalker<Integer,Integer> {
|
||||
/**
|
||||
* The supporting BWT index generated using BWT.
|
||||
*/
|
||||
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
|
||||
private String prefix = null;
|
||||
|
||||
@Output
|
||||
private PrintStream out = null;
|
||||
|
||||
/**
|
||||
* The actual aligner.
|
||||
*/
|
||||
private Aligner aligner = null;
|
||||
|
||||
private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();
|
||||
|
||||
/**
|
||||
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
if(prefix == null)
|
||||
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
|
||||
BWTFiles bwtFiles = new BWTFiles(prefix);
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aligns a read to the given reference.
|
||||
*
|
||||
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||
* @param read Read to align.
|
||||
* @return Number of alignments found for this read.
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
|
||||
if(alignmentIterator.hasNext()) {
|
||||
int numAlignments = alignmentIterator.next().length;
|
||||
if(alignmentFrequencies.containsKey(numAlignments))
|
||||
alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
|
||||
else
|
||||
alignmentFrequencies.put(numAlignments,1);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initial value for reduce. In this case, validated reads will be counted.
|
||||
* @return 0, indicating no reads yet validated.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
/**
|
||||
* Calculates the number of reads processed.
|
||||
* @param value Number of reads processed by this map.
|
||||
* @param sum Number of reads processed before this map.
|
||||
* @return Number of reads processed up to and including this map.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup.
|
||||
* @param result Number of reads processed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
aligner.close();
|
||||
for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
|
||||
out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -117,6 +117,15 @@ public final class RodBinding<T extends Feature> {
|
|||
this.bound = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName
|
||||
* @param type
|
||||
* @param rawName
|
||||
*/
|
||||
public RodBinding(Class<T> type, final String rawName) {
|
||||
this(type, rawName, "missing", type.getSimpleName(), new Tags());
|
||||
}
|
||||
|
||||
/**
|
||||
* Make an unbound RodBinding<T>. Only available for creating the globally unique UNBOUND object
|
||||
* @param type class this unbound RodBinding creates
|
||||
|
|
|
|||
|
|
@ -112,31 +112,35 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
}
|
||||
}
|
||||
|
||||
protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
|
||||
protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
|
||||
public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
|
||||
public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
|
||||
private static void checkForMaskedUserErrors(final Throwable t) {
|
||||
final String message = t.getMessage();
|
||||
if ( message == null )
|
||||
return;
|
||||
|
||||
// we know what to do about the common "Too many open files" error
|
||||
if ( message.indexOf("Too many open files") != -1 )
|
||||
if ( message.contains("Too many open files") )
|
||||
exitSystemWithUserError(new UserException.TooManyOpenFiles());
|
||||
|
||||
// malformed BAM looks like a SAM file
|
||||
if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 ||
|
||||
message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 )
|
||||
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
|
||||
message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
|
||||
exitSystemWithSamError(t);
|
||||
|
||||
// can't close tribble index when writing
|
||||
if ( message.indexOf("Unable to close index for") != -1 )
|
||||
if ( message.contains("Unable to close index for") )
|
||||
exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
|
||||
|
||||
// disk is full
|
||||
if ( message.indexOf("No space left on device") != -1 )
|
||||
exitSystemWithUserError(new UserException(t.getMessage()));
|
||||
if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 )
|
||||
exitSystemWithUserError(new UserException(t.getCause().getMessage()));
|
||||
if ( message.contains("No space left on device") )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
|
||||
// masked out of memory error
|
||||
if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError )
|
||||
exitSystemWithUserError(new UserException.NotEnoughMemory());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,52 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Describes the method for downsampling reads at a given locus.
|
||||
*
|
||||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
||||
public class DownsamplingMethod {
|
||||
/**
|
||||
* Type of downsampling to perform.
|
||||
*/
|
||||
public final DownsampleType type;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as an integer number of reads.
|
||||
*/
|
||||
public final Integer toCoverage;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as a fraction of total available reads.
|
||||
*/
|
||||
public final Double toFraction;
|
||||
|
||||
/**
|
||||
* Expresses no downsampling applied at all.
|
||||
*/
|
||||
public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null);
|
||||
|
||||
public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
|
||||
// Do some basic sanity checks on the downsampling parameters passed in.
|
||||
|
||||
// Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator.
|
||||
if(type != DownsampleType.NONE && toFraction == null && toCoverage == null)
|
||||
throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
|
||||
|
||||
// Fraction and coverage cannot both be specified.
|
||||
if(toFraction != null && toCoverage != null)
|
||||
throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");
|
||||
|
||||
// Experimental by sample downsampling does not work with a fraction of reads.
|
||||
if(type == DownsampleType.BY_SAMPLE && toFraction != null)
|
||||
throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");
|
||||
|
||||
this.type = type;
|
||||
this.toCoverage = toCoverage;
|
||||
this.toFraction = toFraction;
|
||||
}
|
||||
}
|
||||
|
|
@ -30,19 +30,21 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
|
|
@ -50,20 +52,16 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
|
|||
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -136,11 +134,18 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private Collection<ReadFilter> filters;
|
||||
|
||||
/**
|
||||
* Collection of the read transformers applied to the reads
|
||||
*/
|
||||
private List<ReadTransformer> readTransformers;
|
||||
|
||||
/**
|
||||
* Controls the allocation of threads between CPU vs IO.
|
||||
*/
|
||||
private ThreadAllocation threadAllocation;
|
||||
|
||||
private ReadMetrics cumulativeMetrics = null;
|
||||
|
||||
/**
|
||||
* A currently hacky unique name for this GATK instance
|
||||
*/
|
||||
|
|
@ -175,6 +180,13 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private Collection<RMDTriplet> referenceMetaDataFiles;
|
||||
|
||||
/**
|
||||
* The threading efficiency monitor we use in the GATK to monitor our efficiency.
|
||||
*
|
||||
* May be null if one isn't active, or hasn't be initialized yet
|
||||
*/
|
||||
private ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
|
||||
|
||||
/**
|
||||
* Set the reference metadata files to use for this traversal.
|
||||
* @param referenceMetaDataFiles Collection of files and descriptors over which to traverse.
|
||||
|
|
@ -252,6 +264,7 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
// our microscheduler, which is in charge of running everything
|
||||
MicroScheduler microScheduler = createMicroscheduler();
|
||||
threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
|
||||
|
||||
// create temp directories as necessary
|
||||
initializeTempDirectory();
|
||||
|
|
@ -280,6 +293,8 @@ public class GenomeAnalysisEngine {
|
|||
static {
|
||||
deprecatedGATKWalkers.put("CountCovariates", "2.0");
|
||||
deprecatedGATKWalkers.put("TableRecalibration", "2.0");
|
||||
deprecatedGATKWalkers.put("AlignmentWalker", "2.2");
|
||||
deprecatedGATKWalkers.put("CountBestAlignments", "2.2");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -349,32 +364,59 @@ public class GenomeAnalysisEngine {
|
|||
return Collections.unmodifiableList(filters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of active, initialized read transformers
|
||||
*
|
||||
* @param walker the walker we need to apply read transformers too
|
||||
* @return a non-null list of read transformers
|
||||
*/
|
||||
public void initializeReadTransformers(final Walker walker) {
|
||||
final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>();
|
||||
|
||||
final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class);
|
||||
final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null;
|
||||
|
||||
final PluginManager<ReadTransformer> pluginManager = new PluginManager<ReadTransformer>(ReadTransformer.class);
|
||||
|
||||
for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) {
|
||||
transformer.initialize(overrideTime, this, walker);
|
||||
if ( transformer.enabled() )
|
||||
activeTransformers.add(transformer);
|
||||
}
|
||||
|
||||
setReadTransformers(activeTransformers);
|
||||
}
|
||||
|
||||
public List<ReadTransformer> getReadTransformers() {
|
||||
return readTransformers;
|
||||
}
|
||||
|
||||
private void setReadTransformers(final List<ReadTransformer> readTransformers) {
|
||||
if ( readTransformers == null )
|
||||
throw new ReviewedStingException("read transformers cannot be null");
|
||||
this.readTransformers = readTransformers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse out the thread allocation from the given command-line argument.
|
||||
*/
|
||||
private void determineThreadAllocation() {
|
||||
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
|
||||
if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads);
|
||||
if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread);
|
||||
if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads);
|
||||
|
||||
// TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
|
||||
Integer numCPUThreads = null;
|
||||
if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null)
|
||||
throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
|
||||
else if(tags.containsKey("cpu"))
|
||||
numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
|
||||
else if(argCollection.numberOfCPUThreads != null)
|
||||
numCPUThreads = argCollection.numberOfCPUThreads;
|
||||
|
||||
Integer numIOThreads = null;
|
||||
if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
|
||||
throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
|
||||
else if(tags.containsKey("io"))
|
||||
numIOThreads = Integer.parseInt(tags.getValue("io"));
|
||||
else if(argCollection.numberOfIOThreads != null)
|
||||
numIOThreads = argCollection.numberOfIOThreads;
|
||||
|
||||
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
|
||||
this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads,
|
||||
argCollection.numberOfCPUThreadsPerDataThread,
|
||||
argCollection.numberOfIOThreads,
|
||||
! argCollection.disableEfficiencyMonitor);
|
||||
}
|
||||
|
||||
public int getTotalNumberOfThreads() {
|
||||
return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Allow subclasses and others within this package direct access to the walker manager.
|
||||
* @return The walker manager used by this package.
|
||||
|
|
@ -400,23 +442,24 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
GATKArgumentCollection argCollection = this.getArguments();
|
||||
DownsamplingMethod method;
|
||||
if(argCollection.getDownsamplingMethod() != null)
|
||||
method = argCollection.getDownsamplingMethod();
|
||||
else if(WalkerManager.getDownsamplingMethod(walker) != null)
|
||||
method = WalkerManager.getDownsamplingMethod(walker);
|
||||
else
|
||||
method = GATKArgumentCollection.getDefaultDownsamplingMethod();
|
||||
return method;
|
||||
boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
|
||||
|
||||
// until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling
|
||||
if ( useExperimentalDownsampling ) {
|
||||
throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!");
|
||||
}
|
||||
|
||||
DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
|
||||
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
|
||||
return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
|
||||
}
|
||||
|
||||
protected void setDownsamplingMethod(DownsamplingMethod method) {
|
||||
argCollection.setDownsamplingMethod(method);
|
||||
}
|
||||
|
||||
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
|
||||
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
|
||||
|
||||
protected boolean includeReadsWithDeletionAtLoci() {
|
||||
return walker.includeReadsWithDeletionAtLoci();
|
||||
}
|
||||
|
|
@ -697,13 +740,12 @@ public class GenomeAnalysisEngine {
|
|||
protected void initializeDataSources() {
|
||||
logger.info("Strictness is " + argCollection.strictnessLevel);
|
||||
|
||||
// TODO -- REMOVE ME
|
||||
BAQ.DEFAULT_GOP = argCollection.BAQGOP;
|
||||
|
||||
validateSuppliedReference();
|
||||
setReferenceDataSource(argCollection.referenceFile);
|
||||
|
||||
validateSuppliedReads();
|
||||
initializeReadTransformers(walker);
|
||||
|
||||
readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());
|
||||
|
||||
for (ReadFilter filter : filters)
|
||||
|
|
@ -784,14 +826,13 @@ public class GenomeAnalysisEngine {
|
|||
* @return A data source for the given set of reads.
|
||||
*/
|
||||
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
|
||||
DownsamplingMethod method = getDownsamplingMethod();
|
||||
DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
|
||||
|
||||
// Synchronize the method back into the collection so that it shows up when
|
||||
// interrogating for the downsample method during command line recreation.
|
||||
setDownsamplingMethod(method);
|
||||
setDownsamplingMethod(downsamplingMethod);
|
||||
|
||||
if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
|
||||
throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
|
||||
logger.info(downsamplingMethod);
|
||||
|
||||
if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
|
||||
throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
|
||||
|
|
@ -809,14 +850,11 @@ public class GenomeAnalysisEngine {
|
|||
argCollection.useOriginalBaseQualities,
|
||||
argCollection.strictnessLevel,
|
||||
argCollection.readBufferSize,
|
||||
method,
|
||||
downsamplingMethod,
|
||||
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
|
||||
filters,
|
||||
readTransformers,
|
||||
includeReadsWithDeletionAtLoci(),
|
||||
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
|
||||
getWalkerBAQQualityMode(),
|
||||
refReader,
|
||||
getBaseRecalibration(),
|
||||
argCollection.defaultBaseQualities,
|
||||
removeProgramRecords);
|
||||
}
|
||||
|
|
@ -1000,7 +1038,19 @@ public class GenomeAnalysisEngine {
|
|||
* owned by the caller; the caller can do with the object what they wish.
|
||||
*/
|
||||
public ReadMetrics getCumulativeMetrics() {
|
||||
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
|
||||
// todo -- probably shouldn't be lazy
|
||||
if ( cumulativeMetrics == null )
|
||||
cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics();
|
||||
return cumulativeMetrics;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the global ThreadEfficiencyMonitor, if there is one
|
||||
*
|
||||
* @return the monitor, or null if none is active
|
||||
*/
|
||||
public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
|
||||
return threadEfficiencyMonitor;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk;
|
|||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
|
@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable {
|
|||
return nRecords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
|
||||
*/
|
||||
public void incrementNumIterations(final long by) {
|
||||
nRecords += by;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
|
||||
*/
|
||||
public void incrementNumIterations() {
|
||||
nRecords++;
|
||||
incrementNumIterations(1);
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
/**
|
||||
* User: hanna
|
||||
* Date: May 14, 2009
|
||||
|
|
@ -34,12 +34,9 @@ public class ReadProperties {
|
|||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final ValidationExclusion exclusionList;
|
||||
private final Collection<ReadFilter> supplementalFilters;
|
||||
private final List<ReadTransformer> readTransformers;
|
||||
private final boolean includeReadsWithDeletionAtLoci;
|
||||
private final boolean useOriginalBaseQualities;
|
||||
private final BAQ.CalculationMode cmode;
|
||||
private final BAQ.QualityMode qmode;
|
||||
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
|
||||
private final BaseRecalibration bqsrApplier;
|
||||
private final byte defaultBaseQualities;
|
||||
|
||||
/**
|
||||
|
|
@ -95,6 +92,11 @@ public class ReadProperties {
|
|||
return supplementalFilters;
|
||||
}
|
||||
|
||||
|
||||
public List<ReadTransformer> getReadTransformers() {
|
||||
return readTransformers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether to use original base qualities.
|
||||
* @return Whether to use original base qualities.
|
||||
|
|
@ -103,16 +105,6 @@ public class ReadProperties {
|
|||
return useOriginalBaseQualities;
|
||||
}
|
||||
|
||||
|
||||
public BAQ.QualityMode getBAQQualityMode() { return qmode; }
|
||||
public BAQ.CalculationMode getBAQCalculationMode() { return cmode; }
|
||||
|
||||
public IndexedFastaSequenceFile getRefReader() {
|
||||
return refReader;
|
||||
}
|
||||
|
||||
public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
|
||||
|
||||
/**
|
||||
* @return Default base quality value to fill reads missing base quality information.
|
||||
*/
|
||||
|
|
@ -134,9 +126,6 @@ public class ReadProperties {
|
|||
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
|
||||
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
|
||||
* bases will be seen in the pileups, and the deletions will be skipped silently.
|
||||
* @param cmode How should we apply the BAQ calculation to the reads?
|
||||
* @param qmode How should we apply the BAQ calculation to the reads?
|
||||
* @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation
|
||||
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
|
||||
*/
|
||||
public ReadProperties( Collection<SAMReaderID> samFiles,
|
||||
|
|
@ -146,11 +135,8 @@ public class ReadProperties {
|
|||
DownsamplingMethod downsamplingMethod,
|
||||
ValidationExclusion exclusionList,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
List<ReadTransformer> readTransformers,
|
||||
boolean includeReadsWithDeletionAtLoci,
|
||||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
byte defaultBaseQualities) {
|
||||
this.readers = samFiles;
|
||||
this.header = header;
|
||||
|
|
@ -158,12 +144,9 @@ public class ReadProperties {
|
|||
this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod;
|
||||
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
|
||||
this.supplementalFilters = supplementalFilters;
|
||||
this.readTransformers = readTransformers;
|
||||
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
|
||||
this.useOriginalBaseQualities = useOriginalBaseQualities;
|
||||
this.cmode = cmode;
|
||||
this.qmode = qmode;
|
||||
this.refReader = refReader;
|
||||
this.bqsrApplier = bqsrApplier;
|
||||
this.defaultBaseQualities = defaultBaseQualities;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,15 +27,18 @@ package org.broadinstitute.sting.gatk;
|
|||
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
||||
import java.lang.annotation.Annotation;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -303,9 +306,10 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walkerClass The class of the walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass) {
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
|
||||
DownsamplingMethod downsamplingMethod = null;
|
||||
|
||||
if( walkerClass.isAnnotationPresent(Downsample.class) ) {
|
||||
|
|
@ -313,17 +317,17 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
DownsampleType type = downsampleParameters.by();
|
||||
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
|
||||
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction);
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
|
||||
}
|
||||
|
||||
return downsamplingMethod;
|
||||
}
|
||||
|
||||
public static BAQ.QualityMode getBAQQualityMode(Walker walker) {
|
||||
return walker.getClass().getAnnotation(BAQMode.class).QualityMode();
|
||||
public static <T extends Annotation> T getWalkerAnnotation(final Walker walker, final Class<T> clazz) {
|
||||
return walker.getClass().getAnnotation(clazz);
|
||||
}
|
||||
|
||||
public static BAQ.ApplicationTime getBAQApplicationTime(Walker walker) {
|
||||
public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) {
|
||||
return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime();
|
||||
}
|
||||
|
||||
|
|
@ -332,10 +336,11 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walker The walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker) {
|
||||
return getDownsamplingMethod(walker.getClass());
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
|
||||
return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
||||
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
|
@ -41,7 +41,9 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
|||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
|
|
@ -138,15 +140,11 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
|
||||
public boolean nonDeterministicRandomSeed = false;
|
||||
|
||||
/**
|
||||
* The override mechanism in the GATK, by default, populates the command-line arguments, then
|
||||
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
|
||||
* by a user explicitly specifying command-line arguments.
|
||||
* TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments.
|
||||
*/
|
||||
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
|
||||
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Downsampling Arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
|
||||
public DownsampleType downsamplingType = null;
|
||||
|
||||
|
|
@ -156,17 +154,20 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
|
||||
public Integer downsampleCoverage = null;
|
||||
|
||||
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
|
||||
@Hidden
|
||||
public boolean enableExperimentalDownsampling = false;
|
||||
|
||||
/**
|
||||
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
|
||||
* a default downsampling mechanism, return the default.
|
||||
* @return The explicitly specified downsampling mechanism, or the default if none exists.
|
||||
*/
|
||||
public DownsamplingMethod getDownsamplingMethod() {
|
||||
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
|
||||
if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
|
||||
return null;
|
||||
if(downsamplingType == null && downsampleCoverage != null)
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null);
|
||||
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
|
||||
|
||||
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -176,9 +177,11 @@ public class GATKArgumentCollection {
|
|||
public void setDownsamplingMethod(DownsamplingMethod method) {
|
||||
if (method == null)
|
||||
throw new IllegalArgumentException("method is null");
|
||||
|
||||
downsamplingType = method.type;
|
||||
downsampleCoverage = method.toCoverage;
|
||||
downsampleFraction = method.toFraction;
|
||||
enableExperimentalDownsampling = method.useExperimentalDownsampling;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -197,17 +200,14 @@ public class GATKArgumentCollection {
|
|||
// performance log arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
|
||||
public File performanceLog = null;
|
||||
|
||||
/**
|
||||
* Gets the default downsampling method, returned if the user didn't specify any downsampling
|
||||
* method.
|
||||
* @return The default downsampling mechanism, or null if none exists.
|
||||
* The file name for the GATK performance log output, or null if you don't want to generate the
|
||||
* detailed performance logging table. This table is suitable for importing into R or any
|
||||
* other analysis software that can read tsv files
|
||||
*/
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod() {
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
|
||||
}
|
||||
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
|
||||
public File performanceLog = null;
|
||||
|
||||
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
|
||||
public Boolean useOriginalBaseQualities = false;
|
||||
|
|
@ -279,20 +279,40 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
|
||||
public ValidationExclusion.TYPE unsafe;
|
||||
|
||||
/** How many threads should be allocated to this analysis. */
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
|
||||
public Integer numberOfThreads = 1;
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Multi-threading arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
|
||||
* TODO: Kill this when I can do a tagged integer in Queue.
|
||||
* How many data threads should be allocated to this analysis? Data threads contains N cpu threads per
|
||||
* data thread, and act as completely data parallel processing, increasing the memory usage of GATK
|
||||
* by M data threads. Data threads generally scale extremely effectively, up to 24 cores
|
||||
*/
|
||||
@Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
|
||||
@Hidden
|
||||
public Integer numberOfCPUThreads = null;
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false)
|
||||
public Integer numberOfDataThreads = 1;
|
||||
|
||||
/**
|
||||
* How many CPU threads should be allocated per data thread? Each CPU thread operates the map
|
||||
* cycle independently, but may run into earlier scaling problems with IO than data threads. Has
|
||||
* the benefit of not requiring X times as much memory per thread as data threads do, but rather
|
||||
* only a constant overhead.
|
||||
*/
|
||||
@Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false)
|
||||
public int numberOfCPUThreadsPerDataThread = 1;
|
||||
|
||||
@Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
|
||||
@Hidden
|
||||
public Integer numberOfIOThreads = null;
|
||||
public int numberOfIOThreads = 0;
|
||||
|
||||
/**
|
||||
* By default the GATK monitors its own efficiency, but this can have a itsy-bitsy tiny
|
||||
* cost (< 0.1%) in runtime because of turning on the JavaBean. This argument allows you
|
||||
* to disable the monitor
|
||||
*/
|
||||
@Argument(fullName = "disableThreadEfficiencyMonitor", shortName = "dtem", doc = "Disable GATK efficiency monitoring", required = false)
|
||||
public Boolean disableEfficiencyMonitor = false;
|
||||
|
||||
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
|
||||
public Integer numberOfBAMFileHandles = null;
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ public class ReferenceContext {
|
|||
* @return The base at the given locus from the reference.
|
||||
*/
|
||||
public byte getBase() {
|
||||
return getBases()[(int)(locus.getStart() - window.getStart())];
|
||||
return getBases()[(locus.getStart() - window.getStart())];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,143 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/**
|
||||
* Key algorithmic helper for ReadBasedReferenceOrderedData
|
||||
*
|
||||
* Takes a single iterator of features, and provides a single capability that returns
|
||||
* the list of RODs that overlap an interval. Allows sequential getOverlapping calls
|
||||
* from intervals provided that these intervals always have increasing getStart() values.
|
||||
*
|
||||
*/
|
||||
class IntervalOverlappingRODsFromStream {
|
||||
/**
|
||||
* Only held for QC purposes
|
||||
*/
|
||||
GenomeLoc lastQuery = null;
|
||||
|
||||
private final String name;
|
||||
private final LinkedList<GATKFeature> currentFeatures = new LinkedList<GATKFeature>();
|
||||
private final PeekableIterator<RODRecordList> futureFeatures;
|
||||
|
||||
/**
|
||||
* Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and
|
||||
* returns RODRecordLists having name
|
||||
*
|
||||
* @param name
|
||||
* @param futureFeatures
|
||||
*/
|
||||
IntervalOverlappingRODsFromStream(final String name, final PeekableIterator<RODRecordList> futureFeatures) {
|
||||
if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null");
|
||||
|
||||
this.name = name;
|
||||
this.futureFeatures = futureFeatures;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of RODs overlapping loc from this stream of RODs.
|
||||
*
|
||||
* Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart
|
||||
*
|
||||
* @param loc the interval to query
|
||||
* @return a non-null RODRecordList containing the overlapping RODs, which may be empty
|
||||
*/
|
||||
@Ensures({"overlaps(loc, result)",
|
||||
"! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)",
|
||||
"result != null"})
|
||||
public RODRecordList getOverlapping(final GenomeLoc loc) {
|
||||
if ( lastQuery != null && loc.getStart() < lastQuery.getStart() )
|
||||
throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery));
|
||||
|
||||
trimCurrentFeaturesToLoc(loc);
|
||||
readOverlappingFutureFeatures(loc);
|
||||
return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* For contract assurance. Checks that all bindings in loc overlap
|
||||
*
|
||||
* @param loc
|
||||
* @param bindings
|
||||
* @return
|
||||
*/
|
||||
@Requires({"loc != null", "bindings != null"})
|
||||
private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) {
|
||||
for ( final GATKFeature feature : bindings )
|
||||
if ( ! feature.getLocation().overlapsP(loc) )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subset the features in all to those that overlap with loc
|
||||
*
|
||||
* The current features list contains everything read that cannot be thrown away yet, but not
|
||||
* everything in there necessarily overlaps with loc. Subset to just those that do overlap
|
||||
*
|
||||
* @param loc the location that features must overlap
|
||||
* @param all the list of all features
|
||||
* @return a subset of all that overlaps with loc
|
||||
*/
|
||||
@Requires({"loc != null", "all != null"})
|
||||
@Ensures("result.size() <= all.size()")
|
||||
private Collection<GATKFeature> subsetToOverlapping(final GenomeLoc loc, final Collection<GATKFeature> all) {
|
||||
final LinkedList<GATKFeature> overlapping = new LinkedList<GATKFeature>();
|
||||
for ( final GATKFeature feature : all )
|
||||
if ( feature.getLocation().overlapsP(loc) )
|
||||
overlapping.add(feature);
|
||||
return overlapping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update function. Remove all elements of currentFeatures that end before loc
|
||||
*
|
||||
* @param loc the location to use
|
||||
*/
|
||||
@Requires("loc != null")
|
||||
@Ensures("currentFeatures.size() <= old(currentFeatures.size())")
|
||||
private void trimCurrentFeaturesToLoc(final GenomeLoc loc) {
|
||||
final ListIterator<GATKFeature> it = currentFeatures.listIterator();
|
||||
while ( it.hasNext() ) {
|
||||
final GATKFeature feature = it.next();
|
||||
if ( feature.getLocation().isBefore(loc) )
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update function: Read all elements from futureFeatures that overlap with loc
|
||||
*
|
||||
* Stops at the first element that starts before the end of loc, or the stream empties
|
||||
*
|
||||
* @param loc
|
||||
*/
|
||||
@Requires("loc != null")
|
||||
@Ensures("currentFeatures.size() >= old(currentFeatures.size())")
|
||||
private void readOverlappingFutureFeatures(final GenomeLoc loc) {
|
||||
while ( futureFeatures.hasNext() ) {
|
||||
final GenomeLoc nextLoc = futureFeatures.peek().getLocation();
|
||||
if ( nextLoc.isBefore(loc) ) {
|
||||
futureFeatures.next(); // next rod element is before loc, throw it away and keep looking
|
||||
} else if ( nextLoc.isPast(loc) ) {
|
||||
break; // next element is past loc, stop looking but don't pop it
|
||||
} else if ( nextLoc.overlapsP(loc) ) {
|
||||
// add overlapping elements to our current features, removing from stream
|
||||
for ( final GATKFeature feature : futureFeatures.next() ) {
|
||||
currentFeatures.add(feature);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
|
|
@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View {
|
|||
|
||||
// Cache the current and apply filtering.
|
||||
AlignmentContext current = nextLocus;
|
||||
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
|
||||
|
||||
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
|
||||
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
|
||||
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
|
||||
|
||||
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
|
||||
}
|
||||
|
||||
// Indicate that the next operation will need to advance.
|
||||
nextLocus = null;
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView {
|
|||
// todo -- warning, I removed the reference to the name from states
|
||||
bindings.add( state.iterator.seekForward(loc) );
|
||||
|
||||
return new RefMetaDataTracker(bindings, referenceContext);
|
||||
return new RefMetaDataTracker(bindings);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -23,40 +23,63 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */
|
||||
/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */
|
||||
public class ReadBasedReferenceOrderedView implements View {
|
||||
private final WindowedData window;
|
||||
|
||||
public ReadBasedReferenceOrderedView(ShardDataProvider provider) {
|
||||
window = new WindowedData(provider);
|
||||
provider.register(this);
|
||||
}
|
||||
// a list of the RMDDataState (location->iterators)
|
||||
private final List<RMDDataState> states = new ArrayList<RMDDataState>(1);
|
||||
private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker();
|
||||
|
||||
/**
|
||||
* for testing only please
|
||||
*
|
||||
* @param data the window provider
|
||||
* Used to get genome locs for reads
|
||||
*/
|
||||
ReadBasedReferenceOrderedView(WindowedData data) {
|
||||
window = data;
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
|
||||
/**
|
||||
* The total extent of all reads in this span. We create iterators from our RODs
|
||||
* from the start of this span, to the end.
|
||||
*/
|
||||
private final GenomeLoc shardSpan;
|
||||
|
||||
public ReadBasedReferenceOrderedView(final ShardDataProvider provider) {
|
||||
this.genomeLocParser = provider.getGenomeLocParser();
|
||||
// conditional to optimize the case where we don't have any ROD data
|
||||
this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null;
|
||||
provider.register(this);
|
||||
|
||||
if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) {
|
||||
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
|
||||
states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan)));
|
||||
}
|
||||
}
|
||||
|
||||
public ReadMetaDataTracker getReferenceOrderedDataForRead(SAMRecord read) {
|
||||
return window.getTracker(read);
|
||||
|
||||
/**
|
||||
* Testing constructor
|
||||
*/
|
||||
protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser,
|
||||
final GenomeLoc shardSpan,
|
||||
final List<String> names,
|
||||
final List<PeekableIterator<RODRecordList>> featureSources) {
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.shardSpan = shardSpan;
|
||||
for ( int i = 0; i < names.size(); i++ )
|
||||
states.add(new RMDDataState(names.get(i), featureSources.get(i)));
|
||||
}
|
||||
|
||||
public Collection<Class<? extends View>> getConflictingViews() {
|
||||
|
|
@ -65,135 +88,72 @@ public class ReadBasedReferenceOrderedView implements View {
|
|||
return classes;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (window != null) window.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** stores a window of data, dropping RODs if we've passed the new reads start point. */
|
||||
class WindowedData {
|
||||
// the queue of possibly in-frame RODs; RODs are removed as soon as they are out of scope
|
||||
private final TreeMap<Integer, RODMetaDataContainer> mapping = new TreeMap<Integer, RODMetaDataContainer>();
|
||||
|
||||
// our current location from the last read we processed
|
||||
private GenomeLoc currentLoc;
|
||||
|
||||
// a list of the RMDDataState (location->iterators)
|
||||
private List<RMDDataState> states;
|
||||
|
||||
// the provider; where we get all our information
|
||||
private final ShardDataProvider provider;
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(WindowedData.class);
|
||||
|
||||
/**
|
||||
* create a WindowedData given a shard provider
|
||||
*
|
||||
* @param provider the ShardDataProvider
|
||||
*/
|
||||
public WindowedData(ShardDataProvider provider) {
|
||||
this.provider = provider;
|
||||
}
|
||||
|
||||
/**
|
||||
* load the states dynamically, since the only way to get a genome loc is from the read (the shard doesn't have one)
|
||||
*
|
||||
* @param provider the ShardDataProvider
|
||||
* @param rec the current read
|
||||
*/
|
||||
private void getStates(ShardDataProvider provider, SAMRecord rec) {
|
||||
|
||||
int stop = Integer.MAX_VALUE;
|
||||
// figure out the appropriate alignment stop
|
||||
if (provider.hasReference()) {
|
||||
stop = provider.getReference().getSequenceDictionary().getSequence(rec.getReferenceIndex()).getSequenceLength();
|
||||
}
|
||||
|
||||
// calculate the range of positions we need to look at
|
||||
GenomeLoc range = provider.getGenomeLocParser().createGenomeLoc(rec.getReferenceName(),
|
||||
rec.getAlignmentStart(),
|
||||
stop);
|
||||
states = new ArrayList<RMDDataState>();
|
||||
if (provider.getReferenceOrderedData() != null)
|
||||
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
|
||||
states.add(new RMDDataState(dataSource, dataSource.seek(range)));
|
||||
}
|
||||
|
||||
/**
|
||||
* this function is for testing only
|
||||
*
|
||||
* @param states a list of RMDDataState to initialize with
|
||||
*/
|
||||
WindowedData(List<RMDDataState> states) {
|
||||
this.states = states;
|
||||
provider = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a ReadMetaDataTracker given the current read
|
||||
* create a RefMetaDataTracker given the current read
|
||||
*
|
||||
* @param rec the read
|
||||
*
|
||||
* @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments
|
||||
* @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments
|
||||
*/
|
||||
public ReadMetaDataTracker getTracker(SAMRecord rec) {
|
||||
updatePosition(rec);
|
||||
return new ReadMetaDataTracker(provider.getGenomeLocParser(), rec, mapping);
|
||||
@Requires("rec != null")
|
||||
@Ensures("result != null")
|
||||
public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) {
|
||||
if ( rec.getReadUnmappedFlag() )
|
||||
// empty RODs for unmapped reads
|
||||
return new RefMetaDataTracker();
|
||||
else
|
||||
return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec));
|
||||
}
|
||||
|
||||
/**
|
||||
* update the position we're storing
|
||||
*
|
||||
* @param rec the read to use for start and end
|
||||
*/
|
||||
private void updatePosition(SAMRecord rec) {
|
||||
if (states == null) getStates(this.provider, rec);
|
||||
currentLoc = provider.getGenomeLocParser().createGenomeLoc(rec);
|
||||
|
||||
// flush the queue looking for records we've passed over
|
||||
while (mapping.size() > 0 && mapping.firstKey() < currentLoc.getStart())
|
||||
mapping.pollFirstEntry(); // toss away records that we've passed
|
||||
|
||||
// add new data to the queue
|
||||
for (RMDDataState state : states) {
|
||||
// move into position
|
||||
while (state.iterator.hasNext() && state.iterator.peekNextLocation().isBefore(currentLoc))
|
||||
state.iterator.next();
|
||||
while (state.iterator.hasNext() && state.iterator.peekNextLocation().overlapsP(currentLoc)) {
|
||||
RODRecordList list = state.iterator.next();
|
||||
for (GATKFeature datum : list) {
|
||||
if (!mapping.containsKey(list.getLocation().getStart()))
|
||||
mapping.put(list.getLocation().getStart(), new RODMetaDataContainer());
|
||||
mapping.get(list.getLocation().getStart()).addEntry(datum);
|
||||
}
|
||||
}
|
||||
@Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"})
|
||||
@Ensures("result != null")
|
||||
public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) {
|
||||
if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers)
|
||||
return EMPTY_TRACKER;
|
||||
else {
|
||||
final List<RODRecordList> bindings = new ArrayList<RODRecordList>(states.size());
|
||||
for ( final RMDDataState state : states )
|
||||
bindings.add(state.stream.getOverlapping(interval));
|
||||
return new RefMetaDataTracker(bindings);
|
||||
}
|
||||
}
|
||||
|
||||
/** Closes the current view. */
|
||||
/**
|
||||
* Closes the current view.
|
||||
*/
|
||||
public void close() {
|
||||
if (states == null) return;
|
||||
for (RMDDataState state : states)
|
||||
state.dataSource.close( state.iterator );
|
||||
for (final RMDDataState state : states)
|
||||
state.close();
|
||||
|
||||
// Clear out the existing data so that post-close() accesses to this data will fail-fast.
|
||||
states = null;
|
||||
states.clear();
|
||||
}
|
||||
|
||||
/** Models the traversal state of a given ROD lane. */
|
||||
private static class RMDDataState {
|
||||
public final ReferenceOrderedDataSource dataSource;
|
||||
public final IntervalOverlappingRODsFromStream stream;
|
||||
private final LocationAwareSeekableRODIterator iterator;
|
||||
|
||||
}
|
||||
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
|
||||
this.dataSource = dataSource;
|
||||
this.iterator = iterator;
|
||||
this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<RODRecordList>(iterator));
|
||||
}
|
||||
|
||||
/** Models the traversal state of a given ROD lane. */
|
||||
class RMDDataState {
|
||||
public final ReferenceOrderedDataSource dataSource;
|
||||
public final LocationAwareSeekableRODIterator iterator;
|
||||
/**
|
||||
* For testing
|
||||
*/
|
||||
public RMDDataState(final String name, final PeekableIterator<RODRecordList> iterator) {
|
||||
this.dataSource = null;
|
||||
this.iterator = null;
|
||||
this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<RODRecordList>(iterator));
|
||||
}
|
||||
|
||||
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
|
||||
this.dataSource = dataSource;
|
||||
this.iterator = iterator;
|
||||
public void close() {
|
||||
if ( dataSource != null )
|
||||
dataSource.close( iterator );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -59,16 +59,18 @@ public class ReadReferenceView extends ReferenceView {
|
|||
}
|
||||
|
||||
public byte[] getBases() {
|
||||
// System.out.printf("Getting bases for location %s%n", loc);
|
||||
// throw new StingException("x");
|
||||
return getReferenceBases(loc);
|
||||
}
|
||||
}
|
||||
|
||||
public ReferenceContext getReferenceContext( SAMRecord read ) {
|
||||
/**
|
||||
* Return a reference context appropriate for the span of read
|
||||
*
|
||||
* @param read the mapped read to test
|
||||
* @return
|
||||
*/
|
||||
public ReferenceContext getReferenceContext( final SAMRecord read ) {
|
||||
GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
|
||||
// byte[] bases = super.getReferenceBases(loc);
|
||||
// return new ReferenceContext( loc, loc, bases );
|
||||
return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) );
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
|||
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) {
|
||||
// special case the interval again -- add it into the ROD
|
||||
if ( interval != null ) { allTracksHere.add(interval); }
|
||||
return new RefMetaDataTracker(allTracksHere, referenceContext);
|
||||
return new RefMetaDataTracker(allTracksHere);
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
|
|
|
|||
|
|
@ -94,6 +94,13 @@ public abstract class ShardDataProvider {
|
|||
return referenceOrderedData;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if reference ordered data will be provided by this shard
|
||||
*/
|
||||
public boolean hasReferenceOrderedData() {
|
||||
return ! getReferenceOrderedData().isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a data provider for the shard given the reads and reference.
|
||||
* @param shard The chunk of data over which traversals happen.
|
||||
|
|
|
|||
|
|
@ -6,11 +6,9 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
|||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -58,6 +56,15 @@ public class ReadShard extends Shard {
|
|||
MAX_READS = bufferSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* What read buffer size are we using?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static int getReadBufferSize() {
|
||||
return MAX_READS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this shard is meant to buffer reads, rather
|
||||
* than just holding pointers to their locations.
|
||||
|
|
@ -116,4 +123,33 @@ public class ReadShard extends Shard {
|
|||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the full span from the start of the left most read to the end of the right most one
|
||||
*
|
||||
* Note this may be different than the getLocation() of the shard, as this reflects the
|
||||
* targeted span, not the actual span of reads
|
||||
*
|
||||
* @return the genome loc representing the span of these reads on the genome
|
||||
*/
|
||||
public GenomeLoc getReadsSpan() {
|
||||
if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() )
|
||||
return super.getLocation();
|
||||
else {
|
||||
int start = Integer.MAX_VALUE;
|
||||
int stop = Integer.MIN_VALUE;
|
||||
String contig = null;
|
||||
|
||||
for ( final SAMRecord read : reads ) {
|
||||
if ( contig != null && ! read.getReferenceName().equals(contig) )
|
||||
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
|
||||
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
|
||||
contig = read.getReferenceName();
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
}
|
||||
|
||||
return parser.createGenomeLoc(contig, start, stop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,14 +24,15 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.sam.MergingSamRecordIterator;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.downsampling.*;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
|
|
@ -42,12 +43,9 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
|||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
|
||||
import org.broadinstitute.sting.utils.baq.ReadTransformingIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -156,6 +154,8 @@ public class SAMDataSource {
|
|||
*/
|
||||
private final ThreadAllocation threadAllocation;
|
||||
|
||||
private final boolean expandShardsForDownsampling;
|
||||
|
||||
/**
|
||||
* Create a new SAM data source given the supplied read metadata.
|
||||
* @param samFiles list of reads files.
|
||||
|
|
@ -200,11 +200,8 @@ public class SAMDataSource {
|
|||
downsamplingMethod,
|
||||
exclusionList,
|
||||
supplementalFilters,
|
||||
Collections.<ReadTransformer>emptyList(),
|
||||
includeReadsWithDeletionAtLoci,
|
||||
BAQ.CalculationMode.OFF,
|
||||
BAQ.QualityMode.DONT_MODIFY,
|
||||
null, // no BAQ
|
||||
null, // no BQSR
|
||||
(byte) -1,
|
||||
false);
|
||||
}
|
||||
|
|
@ -234,11 +231,8 @@ public class SAMDataSource {
|
|||
DownsamplingMethod downsamplingMethod,
|
||||
ValidationExclusion exclusionList,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
List<ReadTransformer> readTransformers,
|
||||
boolean includeReadsWithDeletionAtLoci,
|
||||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
byte defaultBaseQualities,
|
||||
boolean removeProgramRecords) {
|
||||
this.readMetrics = new ReadMetrics();
|
||||
|
|
@ -262,7 +256,7 @@ public class SAMDataSource {
|
|||
else {
|
||||
// Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively
|
||||
// will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
|
||||
ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000));
|
||||
ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000));
|
||||
}
|
||||
|
||||
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
|
||||
|
|
@ -308,13 +302,15 @@ public class SAMDataSource {
|
|||
downsamplingMethod,
|
||||
exclusionList,
|
||||
supplementalFilters,
|
||||
readTransformers,
|
||||
includeReadsWithDeletionAtLoci,
|
||||
cmode,
|
||||
qmode,
|
||||
refReader,
|
||||
bqsrApplier,
|
||||
defaultBaseQualities);
|
||||
|
||||
expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null &&
|
||||
readProperties.getDownsamplingMethod().useExperimentalDownsampling &&
|
||||
readProperties.getDownsamplingMethod().type != DownsampleType.NONE &&
|
||||
readProperties.getDownsamplingMethod().toCoverage != null;
|
||||
|
||||
// cache the read group id (original) -> read group id (merged)
|
||||
// and read group id (merged) -> read group id (original) mappings.
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
|
|
@ -470,6 +466,16 @@ public class SAMDataSource {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places?
|
||||
*
|
||||
* @return true if we are using expanded shards, otherwise false
|
||||
*/
|
||||
public boolean usingExpandedShards() {
|
||||
return expandShardsForDownsampling;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Fill the given buffering shard with reads.
|
||||
* @param shard Shard to fill.
|
||||
|
|
@ -486,9 +492,40 @@ public class SAMDataSource {
|
|||
|
||||
CloseableIterator<SAMRecord> iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate);
|
||||
while(!shard.isBufferFull() && iterator.hasNext()) {
|
||||
read = iterator.next();
|
||||
shard.addRead(read);
|
||||
noteFilePositionUpdate(positionUpdates,read);
|
||||
final SAMRecord nextRead = iterator.next();
|
||||
if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) {
|
||||
// only add reads to the shard if they are on the same contig
|
||||
read = nextRead;
|
||||
shard.addRead(read);
|
||||
noteFilePositionUpdate(positionUpdates,read);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If the reads are sorted in coordinate order, ensure that all reads
|
||||
// having the same alignment start become part of the same shard, to allow
|
||||
// downsampling to work better across shard boundaries. Note that because our
|
||||
// read stream has already been fed through the positional downsampler, which
|
||||
// ensures that at each alignment start position there are no more than dcov
|
||||
// reads, we're in no danger of accidentally creating a disproportionately huge
|
||||
// shard
|
||||
if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) {
|
||||
while ( iterator.hasNext() ) {
|
||||
SAMRecord additionalRead = iterator.next();
|
||||
|
||||
// Stop filling the shard as soon as we encounter a read having a different
|
||||
// alignment start or contig from the last read added in the earlier loop
|
||||
// above, or an unmapped read
|
||||
if ( read == null ||
|
||||
additionalRead.getReadUnmappedFlag() ||
|
||||
! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) ||
|
||||
additionalRead.getAlignmentStart() != read.getAlignmentStart() ) {
|
||||
break;
|
||||
}
|
||||
shard.addRead(additionalRead);
|
||||
noteFilePositionUpdate(positionUpdates, additionalRead);
|
||||
}
|
||||
}
|
||||
|
||||
// If the reads are sorted in queryname order, ensure that all reads
|
||||
|
|
@ -585,6 +622,7 @@ public class SAMDataSource {
|
|||
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
|
||||
if(shard.getGenomeLocs().size() > 0)
|
||||
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
||||
|
||||
iteratorMap.put(readers.getReader(id), iterator);
|
||||
}
|
||||
|
||||
|
|
@ -597,10 +635,7 @@ public class SAMDataSource {
|
|||
readProperties.getDownsamplingMethod().toFraction,
|
||||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||
readProperties.getSupplementalFilters(),
|
||||
readProperties.getBAQCalculationMode(),
|
||||
readProperties.getBAQQualityMode(),
|
||||
readProperties.getRefReader(),
|
||||
readProperties.getBQSRApplier(),
|
||||
readProperties.getReadTransformers(),
|
||||
readProperties.defaultBaseQualities());
|
||||
}
|
||||
|
||||
|
|
@ -667,40 +702,62 @@ public class SAMDataSource {
|
|||
Double downsamplingFraction,
|
||||
Boolean noValidationOfReadOrder,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
List<ReadTransformer> readTransformers,
|
||||
byte defaultBaseQualities) {
|
||||
|
||||
// *********************************************************************************** //
|
||||
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
// * (otherwise we will process something that we may end up throwing away) * //
|
||||
// *********************************************************************************** //
|
||||
// ************************************************************************************************ //
|
||||
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
// * (otherwise we will process something that we may end up throwing away) * //
|
||||
// ************************************************************************************************ //
|
||||
|
||||
if (downsamplingFraction != null)
|
||||
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
|
||||
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
|
||||
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
|
||||
}
|
||||
|
||||
// Use the old fractional downsampler only if we're not using experimental downsampling:
|
||||
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
|
||||
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
|
||||
// verify the read ordering by applying a sort order iterator
|
||||
if (!noValidationOfReadOrder && enableVerification)
|
||||
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
|
||||
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
|
||||
|
||||
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
|
||||
// only wrap if we are replacing the original qualities or using a default base quality
|
||||
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
|
||||
|
||||
if (bqsrApplier != null)
|
||||
wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);
|
||||
|
||||
if (cmode != BAQ.CalculationMode.OFF)
|
||||
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);
|
||||
// set up read transformers
|
||||
for ( final ReadTransformer readTransformer : readTransformers ) {
|
||||
if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
|
||||
wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
|
||||
}
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
|
||||
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
|
||||
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
|
||||
ReadsDownsampler<SAMRecord> downsampler = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new DownsamplingReadsIterator(wrappedIterator, downsampler);
|
||||
}
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
|
||||
private class SAMResourcePool {
|
||||
/**
|
||||
* How many entries can be cached in this resource pool?
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.gatk;
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
/**
|
||||
* Type of downsampling method to invoke.
|
||||
|
|
@ -28,49 +28,92 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The basic downsampler API, with no reads-specific operations
|
||||
* The basic downsampler API, with no reads-specific operations.
|
||||
*
|
||||
* Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle
|
||||
* any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a
|
||||
* PerSampleDownsamplingReadsIterator.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface Downsampler<T> {
|
||||
|
||||
/*
|
||||
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
|
||||
/**
|
||||
* Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
|
||||
* immediately whether the item survives the downsampling process, while others will need to see
|
||||
* more items before making that determination.
|
||||
*
|
||||
* @param item the individual item to submit to the downsampler for consideration
|
||||
*/
|
||||
public void submit( T item );
|
||||
|
||||
/*
|
||||
* Submit a collection of items to the downsampler for consideration.
|
||||
/**
|
||||
* Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
|
||||
* submit() on each individual item in the collection.
|
||||
*
|
||||
* @param items the collection of items to submit to the downsampler for consideration
|
||||
*/
|
||||
public void submit( Collection<T> items );
|
||||
|
||||
/*
|
||||
/**
|
||||
* Are there items that have survived the downsampling process waiting to be retrieved?
|
||||
*
|
||||
* @return true if this downsampler has > 0 finalized items, otherwise false
|
||||
*/
|
||||
public boolean hasDownsampledItems();
|
||||
public boolean hasFinalizedItems();
|
||||
|
||||
/*
|
||||
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
|
||||
/**
|
||||
* Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
|
||||
*
|
||||
* @return a list of all finalized items this downsampler contains, or an empty list if there are none
|
||||
*/
|
||||
public List<T> consumeDownsampledItems();
|
||||
public List<T> consumeFinalizedItems();
|
||||
|
||||
/*
|
||||
/**
|
||||
* Are there items stored in this downsampler that it doesn't yet know whether they will
|
||||
* ultimately survive the downsampling process?
|
||||
*
|
||||
* @return true if this downsampler has > 0 pending items, otherwise false
|
||||
*/
|
||||
public boolean hasPendingItems();
|
||||
|
||||
/*
|
||||
/**
|
||||
* Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
|
||||
*
|
||||
* @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
|
||||
* or null if there are none
|
||||
*/
|
||||
public T peekFinalized();
|
||||
|
||||
/**
|
||||
* Peek at the first pending item stored in this downsampler (or null if there are no pending items)
|
||||
*
|
||||
* @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
|
||||
* or null if there are none
|
||||
*/
|
||||
public T peekPending();
|
||||
|
||||
/**
|
||||
* Returns the number of items discarded (so far) during the downsampling process
|
||||
*
|
||||
* @return the number of items that have been submitted to this downsampler and discarded in the process of
|
||||
* downsampling
|
||||
*/
|
||||
public int getNumberOfDiscardedItems();
|
||||
|
||||
/**
|
||||
* Used to tell the downsampler that no more items will be submitted to it, and that it should
|
||||
* finalize any pending items.
|
||||
*/
|
||||
public void signalEndOfInput();
|
||||
|
||||
/*
|
||||
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
|
||||
* information.
|
||||
/**
|
||||
* Empty the downsampler of all finalized/pending items
|
||||
*/
|
||||
public void clear();
|
||||
|
||||
/**
|
||||
* Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
|
||||
*/
|
||||
public void reset();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,153 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Describes the method for downsampling reads at a given locus.
|
||||
*/
|
||||
|
||||
public class DownsamplingMethod {
|
||||
/**
|
||||
* Type of downsampling to perform.
|
||||
*/
|
||||
public final DownsampleType type;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as an integer number of reads.
|
||||
*/
|
||||
public final Integer toCoverage;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as a fraction of total available reads.
|
||||
*/
|
||||
public final Double toFraction;
|
||||
|
||||
/**
|
||||
* Use the new experimental downsampling?
|
||||
*/
|
||||
public final boolean useExperimentalDownsampling;
|
||||
|
||||
/**
|
||||
* Expresses no downsampling applied at all.
|
||||
*/
|
||||
public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false);
|
||||
|
||||
/**
|
||||
* Default type to use if no type is specified
|
||||
*/
|
||||
public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
|
||||
|
||||
/**
|
||||
* Default target coverage for locus-based traversals
|
||||
*/
|
||||
public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;
|
||||
|
||||
public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
|
||||
this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
|
||||
this.toCoverage = toCoverage;
|
||||
this.toFraction = toFraction;
|
||||
this.useExperimentalDownsampling = useExperimentalDownsampling;
|
||||
|
||||
if ( type == DownsampleType.NONE ) {
|
||||
toCoverage = null;
|
||||
toFraction = null;
|
||||
}
|
||||
|
||||
validate();
|
||||
}
|
||||
|
||||
private void validate() {
|
||||
// Can't leave toFraction and toCoverage null unless type is NONE
|
||||
if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null )
|
||||
throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
|
||||
|
||||
// Fraction and coverage cannot both be specified.
|
||||
if ( toFraction != null && toCoverage != null )
|
||||
throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");
|
||||
|
||||
// toCoverage must be > 0 when specified
|
||||
if ( toCoverage != null && toCoverage <= 0 ) {
|
||||
throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage");
|
||||
}
|
||||
|
||||
// toFraction must be >= 0.0 and <= 1.0 when specified
|
||||
if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
|
||||
throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the old downsampling implementation:
|
||||
if ( ! useExperimentalDownsampling ) {
|
||||
// By sample downsampling does not work with a fraction of reads in the old downsampling implementation
|
||||
if( type == DownsampleType.BY_SAMPLE && toFraction != null )
|
||||
throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the new downsampling implementation:
|
||||
if ( useExperimentalDownsampling ) {
|
||||
if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder("Downsampling Settings: ");
|
||||
|
||||
if ( type == DownsampleType.NONE ) {
|
||||
builder.append("No downsampling");
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Method: %s ", type));
|
||||
|
||||
if ( toCoverage != null ) {
|
||||
builder.append(String.format("Target Coverage: %d ", toCoverage));
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Target Fraction: %.2f ", toFraction));
|
||||
}
|
||||
|
||||
if ( useExperimentalDownsampling ) {
|
||||
builder.append("Using Experimental Downsampling");
|
||||
}
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
|
||||
if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
|
||||
null, useExperimentalDownsampling);
|
||||
}
|
||||
else {
|
||||
return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -33,7 +33,8 @@ import java.util.NoSuchElementException;
|
|||
|
||||
|
||||
/**
|
||||
* StingSAMIterator wrapper around our generic reads downsampler interface
|
||||
* StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style
|
||||
* downsampler interface to a pull model.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
|
|
@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
|
|||
private StingSAMIterator nestedSAMIterator;
|
||||
private ReadsDownsampler<SAMRecord> downsampler;
|
||||
private Collection<SAMRecord> downsampledReadsCache;
|
||||
private Iterator<SAMRecord> downsampledReadsCacheIterator;
|
||||
private SAMRecord nextRead = null;
|
||||
private Iterator<SAMRecord> downsampledReadsCacheIterator = null;
|
||||
|
||||
/**
|
||||
* @param iter wrapped iterator from which this iterator will pull reads
|
||||
* @param downsampler downsampler through which the reads will be fed
|
||||
*/
|
||||
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
|
||||
nestedSAMIterator = iter;
|
||||
this.downsampler = downsampler;
|
||||
fillDownsampledReadsCache();
|
||||
|
||||
advanceToNextRead();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if ( downsampledReadsCacheIterator.hasNext() ) {
|
||||
return true;
|
||||
}
|
||||
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return nextRead != null;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
|
||||
if ( nextRead == null ) {
|
||||
throw new NoSuchElementException("next() called when there are no more items");
|
||||
}
|
||||
|
||||
return downsampledReadsCacheIterator.next();
|
||||
SAMRecord toReturn = nextRead;
|
||||
advanceToNextRead();
|
||||
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
private void advanceToNextRead() {
|
||||
if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
|
||||
nextRead = null;
|
||||
}
|
||||
else {
|
||||
nextRead = downsampledReadsCacheIterator.next();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean readyToReleaseReads() {
|
||||
return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext();
|
||||
}
|
||||
|
||||
private boolean fillDownsampledReadsCache() {
|
||||
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
|
||||
while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) {
|
||||
downsampler.submit(nestedSAMIterator.next());
|
||||
}
|
||||
|
||||
|
|
@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
|
|||
downsampler.signalEndOfInput();
|
||||
}
|
||||
|
||||
downsampledReadsCache = downsampler.consumeDownsampledItems();
|
||||
// use returned collection directly rather than make a copy, for speed
|
||||
downsampledReadsCache = downsampler.consumeFinalizedItems();
|
||||
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
|
||||
|
||||
return downsampledReadsCacheIterator.hasNext();
|
||||
|
|
|
|||
|
|
@ -33,7 +33,10 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
|
||||
* Fractional Downsampler: selects a specified fraction of the reads for inclusion.
|
||||
*
|
||||
* Since the selection is done randomly, the actual fraction of reads retained may be slightly
|
||||
* more or less than the requested fraction, depending on the total number of reads submitted.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
|
|
@ -43,8 +46,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
|
||||
private int cutoffForInclusion;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
private static final int RANDOM_POOL_SIZE = 10000;
|
||||
|
||||
/**
|
||||
* Construct a FractionalDownsampler
|
||||
*
|
||||
* @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
|
||||
* Actual number of reads preserved may differ randomly.
|
||||
*/
|
||||
public FractionalDownsampler( double fraction ) {
|
||||
if ( fraction < 0.0 || fraction > 1.0 ) {
|
||||
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
|
||||
|
|
@ -52,12 +63,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
|
||||
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
|
||||
clear();
|
||||
reset();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
else {
|
||||
numDiscardedItems++;
|
||||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
|
|
@ -66,11 +81,12 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
public boolean hasFinalizedItems() {
|
||||
return selectedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
|
|
@ -80,6 +96,18 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
return false;
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return selectedReads.isEmpty() ? null : selectedReads.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
|
@ -88,7 +116,15 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
selectedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Factory for creating FractionalDownsamplers on demand
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class FractionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {
|
||||
|
||||
private double fraction;
|
||||
|
||||
public FractionalDownsamplerFactory( double fraction ) {
|
||||
this.fraction = fraction;
|
||||
}
|
||||
|
||||
public ReadsDownsampler<T> newInstance() {
|
||||
return new FractionalDownsampler<T>(fraction);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,212 @@
|
|||
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.downsampling;

import org.broadinstitute.sting.utils.MathUtils;

import java.util.*;

/**
 * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from
 * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling
 * does not occur until all Lists have been submitted and signalEndOfInput() is called.
 *
 * The Lists should be LinkedLists for maximum efficiency during item removal, however other
 * kinds of Lists are also accepted (albeit at a slight performance penalty).
 *
 * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface,
 * the Lists need not contain reads. However this downsampler may not be wrapped within one of the
 * DownsamplingReadsIterators
 *
 * @param <T> the List type representing the stacks to be leveled
 * @param <E> the type of the elements of each List
 *
 * @author David Roazen
 */
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {

    // Combined size across all groups may not exceed this value after leveling
    private int targetSize;

    // The stacks (Lists) submitted so far; leveled in place when signalEndOfInput() is called
    private List<T> groups;

    // True once signalEndOfInput() has run; groups only become consumable after this point
    private boolean groupsAreFinalized;

    // Running count of items removed by leveling; survives clear(), reset by reset()
    private int numDiscardedItems;

    /**
     * Construct a LevelingDownsampler
     *
     * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
     *                   this value -- if it does, items are removed from Lists evenly until the total size
     *                   is <= this value
     */
    public LevelingDownsampler( int targetSize ) {
        this.targetSize = targetSize;
        clear();
        reset();
    }

    /**
     * Submit one group (List of items) for eventual leveling.
     *
     * @param item the group to add
     */
    public void submit( T item ) {
        groups.add(item);
    }

    /**
     * Submit a collection of groups for eventual leveling.
     *
     * @param items the groups to add
     */
    public void submit( Collection<T> items ){
        groups.addAll(items);
    }

    /**
     * @return true if signalEndOfInput() has been called and leveled groups remain to be consumed
     */
    public boolean hasFinalizedItems() {
        return groupsAreFinalized && groups.size() > 0;
    }

    /**
     * Hand back all leveled groups and reset this downsampler's group storage.
     *
     * @return the leveled groups, or an empty List if leveling has not yet occurred
     */
    public List<T> consumeFinalizedItems() {
        if ( ! hasFinalizedItems() ) {
            return new ArrayList<T>();
        }

        // pass by reference rather than make a copy, for speed
        List<T> toReturn = groups;
        clear();
        return toReturn;
    }

    /**
     * @return true if groups have been submitted but not yet leveled (signalEndOfInput() not called)
     */
    public boolean hasPendingItems() {
        return ! groupsAreFinalized && groups.size() > 0;
    }

    /**
     * @return the first finalized group without removing it, or null if none are finalized
     */
    public T peekFinalized() {
        return hasFinalizedItems() ? groups.get(0) : null;
    }

    /**
     * @return the first pending group without removing it, or null if none are pending
     */
    public T peekPending() {
        return hasPendingItems() ? groups.get(0) : null;
    }

    /**
     * @return the number of individual items discarded by leveling since the last reset()
     */
    public int getNumberOfDiscardedItems() {
        return numDiscardedItems;
    }

    /**
     * Trigger the actual leveling pass and mark the groups as finalized/consumable.
     */
    public void signalEndOfInput() {
        levelGroups();
        groupsAreFinalized = true;
    }

    /**
     * Discard all submitted groups and return to the un-finalized state.
     * Does NOT clear the discarded-items counter (see reset()).
     */
    public void clear() {
        groups = new ArrayList<T>();
        groupsAreFinalized = false;
    }

    /**
     * Reset the discarded-items statistic to zero.
     */
    public void reset() {
        numDiscardedItems = 0;
    }

    // Core leveling algorithm: decide how many items each group may keep, then trim each group.
    // No group is ever reduced below one item, so fewer items than requested may be removed.
    private void levelGroups() {
        int totalSize = 0;
        int[] groupSizes = new int[groups.size()];
        int currentGroupIndex = 0;

        // First pass: record each group's size and the combined total
        for ( T group : groups ) {
            groupSizes[currentGroupIndex] = group.size();
            totalSize += groupSizes[currentGroupIndex];
            currentGroupIndex++;
        }

        if ( totalSize <= targetSize ) {
            return; // no need to eliminate any items
        }

        // We will try to remove exactly this many items, however we will refuse to allow any
        // one group to fall below size 1, and so might end up removing fewer items than this
        int numItemsToRemove = totalSize - targetSize;

        currentGroupIndex = 0;
        // NOTE(review): identifier has a typo ("Umodifiable" -> "Unmodifiable"); harmless, local only
        int numConsecutiveUmodifiableGroups = 0;

        // Continue until we've either removed all the items we wanted to, or we can't
        // remove any more items without violating the constraint that all groups must
        // be left with at least one item
        while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) {
            if ( groupSizes[currentGroupIndex] > 1 ) {
                groupSizes[currentGroupIndex]--;
                numItemsToRemove--;
                numConsecutiveUmodifiableGroups = 0;
            }
            else {
                numConsecutiveUmodifiableGroups++;
            }

            // Round-robin across groups so removals are spread evenly
            currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length;
        }

        // Now we actually go through and reduce each group to its new count as specified in groupSizes
        currentGroupIndex = 0;
        for ( T group : groups ) {
            downsampleOneGroup(group, groupSizes[currentGroupIndex]);
            currentGroupIndex++;
        }
    }

    // Randomly select numItemsToKeep survivors within one group and remove the rest,
    // updating the discarded-items statistic.
    private void downsampleOneGroup( T group, int numItemsToKeep ) {
        if ( numItemsToKeep >= group.size() ) {
            return;
        }

        numDiscardedItems += group.size() - numItemsToKeep;

        // Random, without-replacement choice of which positions survive
        BitSet itemsToKeep = new BitSet(group.size());
        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) {
            itemsToKeep.set(selectedIndex);
        }

        int currentIndex = 0;

        // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator
        if ( group instanceof LinkedList ) {
            Iterator iter = group.iterator();
            while ( iter.hasNext() ) {
                iter.next();

                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }

                currentIndex++;
            }
        }
        // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather
        // than suffer O(n^2) of item shifting
        else {
            List<E> keptItems = new ArrayList<E>(numItemsToKeep);

            for ( E item : group ) {
                if ( itemsToKeep.get(currentIndex) ) {
                    keptItems.add(item);
                }
                currentIndex++;
            }
            group.clear();
            group.addAll(keptItems);
        }
    }
}
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.downsampling;

import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordComparator;
import net.sf.samtools.SAMRecordCoordinateComparator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;

import java.util.*;


/**
 * StingSAMIterator wrapper around our generic reads downsampler interface
 * that downsamples reads for each sample independently, and then re-assembles
 * the reads back into a single merged stream.
 *
 * Assumes the wrapped iterator yields reads in coordinate order; output order is
 * maintained via a priority queue keyed on the coordinate comparator.
 *
 * @author David Roazen
 */
public class PerSampleDownsamplingReadsIterator implements StingSAMIterator {

    // Underlying (coordinate-sorted) read stream being downsampled
    private StingSAMIterator nestedSAMIterator;
    // Creates one downsampler per sample, all with identical settings
    private ReadsDownsamplerFactory<SAMRecord> downsamplerFactory;
    // Sample name -> that sample's private downsampler (null key used for reads with no read group)
    private Map<String, ReadsDownsampler<SAMRecord>> perSampleDownsamplers;
    // Finalized (downsampled) reads awaiting release, ordered by coordinate
    private PriorityQueue<SAMRecord> orderedDownsampledReadsCache;
    // The read next() will return, or null when the stream is exhausted
    private SAMRecord nextRead = null;
    private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator();
    // Earliest read still pending inside any downsampler -- reads in the cache at or before
    // this position are safe to emit without breaking sort order
    private SAMRecord earliestPendingRead = null;
    private ReadsDownsampler<SAMRecord> earliestPendingDownsampler = null;

    // Initial size of our cache of finalized reads
    private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096;

    // The number of positional changes that can occur in the read stream before all downsamplers
    // should be informed of the current position (guards against samples with relatively sparse reads
    // getting stuck in a pending state):
    private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value

    /**
     * @param iter wrapped iterator from which this iterator will pull reads
     * @param downsamplerFactory factory used to create new downsamplers as needed
     */
    public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory<SAMRecord> downsamplerFactory ) {
        nestedSAMIterator = iter;
        this.downsamplerFactory = downsamplerFactory;
        perSampleDownsamplers = new HashMap<String, ReadsDownsampler<SAMRecord>>();
        orderedDownsampledReadsCache = new PriorityQueue<SAMRecord>(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator);

        // Prime nextRead so hasNext()/next() work immediately
        advanceToNextRead();
    }

    public boolean hasNext() {
        return nextRead != null;
    }

    /**
     * @return the next downsampled read in coordinate order
     * @throws NoSuchElementException if the stream is exhausted
     */
    public SAMRecord next() {
        if ( nextRead == null ) {
            throw new NoSuchElementException("next() called when there are no more items");
        }

        SAMRecord toReturn = nextRead;
        advanceToNextRead();

        return toReturn;
    }

    // Refill the cache as needed and pull the next emittable read from it (null when exhausted)
    private void advanceToNextRead() {
        if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
            nextRead = null;
        }
        else {
            nextRead = orderedDownsampledReadsCache.poll();
        }
    }

    // A cached read may only be released if no downsampler holds a pending read that
    // sorts before it -- otherwise we could emit reads out of coordinate order
    private boolean readyToReleaseReads() {
        if ( orderedDownsampledReadsCache.isEmpty() ) {
            return false;
        }

        return earliestPendingRead == null ||
               readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0;
    }

    // Keep earliestPendingRead / earliestPendingDownsampler up to date after currentDownsampler's
    // pending state may have changed
    private void updateEarliestPendingRead( ReadsDownsampler<SAMRecord> currentDownsampler ) {
        // If there is no recorded earliest pending read and this downsampler has pending items,
        // then this downsampler's first pending item becomes the new earliest pending read:
        if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) {
            earliestPendingRead = currentDownsampler.peekPending();
            earliestPendingDownsampler = currentDownsampler;
        }
        // In all other cases, we only need to update the earliest pending read when the downsampler
        // associated with it experiences a change in its pending reads, since by assuming a sorted
        // read stream we're assured that each downsampler's earliest pending read will only increase
        // in genomic position over time.
        //
        // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers
        // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))),
        // TODO: but need to verify this empirically.
        else if ( currentDownsampler == earliestPendingDownsampler &&
                  (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) {

            // Linear re-scan of all downsamplers to find the new global earliest pending read
            earliestPendingRead = null;
            earliestPendingDownsampler = null;
            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                if ( perSampleDownsampler.hasPendingItems() &&
                     (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) {

                    earliestPendingRead = perSampleDownsampler.peekPending();
                    earliestPendingDownsampler = perSampleDownsampler;
                }
            }
        }
    }

    // Pump reads from the nested iterator through the per-sample downsamplers until at least
    // one finalized read can be safely released. Returns true if reads are ready for release.
    private boolean fillDownsampledReadsCache() {
        SAMRecord prevRead = null;
        int numPositionalChanges = 0;

        // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue
        // can be released without violating global sort order
        while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) {
            SAMRecord read = nestedSAMIterator.next();
            // Reads without a read group are pooled under the null sample name
            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;

            // Lazily create a downsampler the first time each sample is seen
            ReadsDownsampler<SAMRecord> thisSampleDownsampler = perSampleDownsamplers.get(sampleName);
            if ( thisSampleDownsampler == null ) {
                thisSampleDownsampler = downsamplerFactory.newInstance();
                perSampleDownsamplers.put(sampleName, thisSampleDownsampler);
            }

            thisSampleDownsampler.submit(read);
            updateEarliestPendingRead(thisSampleDownsampler);

            if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) {
                numPositionalChanges++;
            }

            // If the number of times we've changed position exceeds a certain threshold, inform all
            // downsamplers of the current position in the read stream. This is to prevent downsamplers
            // for samples with sparser reads than others from getting stuck too long in a pending state.
            if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) {
                for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                    perSampleDownsampler.signalNoMoreReadsBefore(read);
                    updateEarliestPendingRead(perSampleDownsampler);
                }
            }

            prevRead = read;
        }

        // Input exhausted: force every downsampler to finalize whatever it still holds
        if ( ! nestedSAMIterator.hasNext() ) {
            for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
                perSampleDownsampler.signalEndOfInput();
            }
            earliestPendingRead = null;
            earliestPendingDownsampler = null;
        }

        // Drain all finalized reads into the coordinate-ordered cache
        for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
            if ( perSampleDownsampler.hasFinalizedItems() ) {
                orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems());
            }
        }

        return readyToReleaseReads();
    }

    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
    }

    public void close() {
        nestedSAMIterator.close();
    }

    public Iterator<SAMRecord> iterator() {
        return this;
    }
}
|
||||
|
|
@ -1,259 +0,0 @@
|
|||
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.downsampling;

import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

import java.util.*;

/**
 * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
 *
 * NOTE(review): this hunk records the deletion of this class in the commit being viewed;
 * documentation added here describes the code as it stood before removal.
 *
 * @author David Roazen
 */
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // Desired maximum coverage (overlapping active reads) at any position
    private int targetCoverage;

    // Per-start-position reservoir used to cap reads sharing the current alignment start
    private ReservoirDownsampler<T> reservoir;

    // Position of the reads most recently fed to the reservoir
    private int currentContigIndex;

    private int currentAlignmentStart;

    // Groups of reads (one group per alignment start) that may still overlap future positions
    private LinkedList<PositionalReadGrouping> pendingReads;

    // Reads that can no longer overlap anything upcoming and are ready for consumption
    private ArrayList<T> finalizedReads;

    /**
     * @param targetCoverage maximum number of overlapping reads to retain at any position
     */
    public PositionalDownsampler ( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        clear();
    }

    /**
     * Submit one read; assumes reads arrive in coordinate order (see requiresCoordinateSortOrder()).
     */
    public void submit ( T newRead ) {
        // Moving past the current position: flush the reservoir into a new pending group first
        if ( readIsPastCurrentPosition(newRead) ) {
            updateAndDownsamplePendingReads();
        }

        reservoir.submit(newRead);
        updateCurrentPosition(newRead);
    }

    public void submit ( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    public boolean hasDownsampledItems() {
        return finalizedReads.size() > 0;
    }

    /**
     * @return all finalized (downsampled) reads accumulated so far, resetting the internal list
     */
    public List<T> consumeDownsampledItems() {
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    public boolean hasPendingItems() {
        return pendingReads.size() > 0;
    }

    /**
     * Flush the reservoir and finalize every remaining pending group.
     */
    public void signalEndOfInput() {
        updateAndDownsamplePendingReads();

        for ( PositionalReadGrouping group : pendingReads ) {
            group.finalizeAllActiveReads();
            finalizedReads.addAll(group.getFinalizedReads());
        }

        pendingReads.clear();
    }

    public void clear() {
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        pendingReads = new LinkedList<PositionalReadGrouping>();
        finalizedReads = new ArrayList<T>();
    }

    // Positional downsampling is only meaningful on a coordinate-sorted stream
    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    private void updateCurrentPosition ( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
    }

    // True when the read starts on a later contig or at a later start than the current position
    private boolean readIsPastCurrentPosition ( T read ) {
        return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
    }

    // Finalize out-of-scope reads, move the reservoir's contents into a new pending group,
    // then level the coverage across all groups still overlapping the current position
    private void updateAndDownsamplePendingReads() {
        finalizeOutOfScopeReads();

        List<T> oldLocusReads = reservoir.consumeDownsampledItems();
        pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));

        downsampleOverlappingGroups();
    }

    // Move reads that end before the current position out of their groups; a fully-finalized
    // group is only removed from pendingReads once all earlier groups are finalized too,
    // preserving coordinate order of the finalized output
    private void finalizeOutOfScopeReads() {
        Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
        boolean noPrecedingUnfinalizedGroups = true;

        while ( iter.hasNext() ) {
            PositionalReadGrouping currentGroup = iter.next();
            currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);

            if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
                iter.remove();
                finalizedReads.addAll(currentGroup.getFinalizedReads());
            }
            else {
                noPrecedingUnfinalizedGroups = false;
            }
        }
    }

    // Reduce total active coverage across overlapping groups down to targetCoverage,
    // removing reads round-robin but never emptying a non-empty group completely
    private void downsampleOverlappingGroups() {
        int[] groupReadCounts = new int[pendingReads.size()];
        int totalCoverage = 0;
        int numActiveGroups = 0;
        int currentGroup = 0;

        for ( PositionalReadGrouping group : pendingReads ) {
            groupReadCounts[currentGroup] = group.numActiveReads();
            totalCoverage += groupReadCounts[currentGroup];

            if ( groupReadCounts[currentGroup] > 0 ) {
                numActiveGroups++;
            }

            currentGroup++;
        }

        if ( totalCoverage <= targetCoverage ) {
            return;
        }

        // Can't remove more reads than would leave each active group with at least one read
        int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
        currentGroup = 0;

        while ( numReadsToRemove > 0 ) {
            if ( groupReadCounts[currentGroup] > 1 ) {
                groupReadCounts[currentGroup]--;
                numReadsToRemove--;
            }

            currentGroup = (currentGroup + 1) % groupReadCounts.length;
        }

        currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            if ( ! group.isFinalized() ) {
                group.downsampleActiveReads(groupReadCounts[currentGroup]);
            }
            currentGroup++;
        }
    }

    /**
     * One group of reads sharing an alignment start. Reads move from "active" (may still
     * overlap upcoming positions) to "finalized" (safe to emit) as the stream advances.
     */
    private class PositionalReadGrouping {
        private List<T> activeReads;
        private List<T> finalizedReads;

        // Shared alignment start position of every read in this group
        private int contig;
        private int alignmentStart;

        public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
            activeReads = new LinkedList<T>(reads);
            finalizedReads = new ArrayList<T>();
            this.contig = contig;
            this.alignmentStart = alignmentStart;
        }

        public int numActiveReads() {
            return activeReads.size();
        }

        public boolean isFinalized() {
            return activeReads.size() == 0;
        }

        public List<T> getFinalizedReads() {
            return finalizedReads;
        }

        /**
         * Finalize every active read ending before the given position (or all of them,
         * if the position is on a different contig).
         */
        public void finalizeActiveReadsBeforePosition( int contig, int position ) {
            if ( this.contig != contig ) {
                finalizeAllActiveReads();
                return;
            }

            Iterator<T> iter = activeReads.iterator();

            while ( iter.hasNext() ) {
                T read = iter.next();
                if ( read.getAlignmentEnd() < position ) {
                    iter.remove();
                    finalizedReads.add(read);
                }
            }
        }

        public void finalizeAllActiveReads() {
            finalizedReads.addAll(activeReads);
            activeReads.clear();
        }

        /**
         * Randomly retain exactly numReadsToKeep of the active reads, discarding the rest.
         *
         * @throws ReviewedStingException if numReadsToKeep is negative or exceeds the active count
         */
        public void downsampleActiveReads( int numReadsToKeep ) {
            if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
                throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
                                                               numReadsToKeep, activeReads.size()));
            }

            // Random, without-replacement choice of surviving positions
            BitSet itemsToKeep = new BitSet(activeReads.size());
            for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
                itemsToKeep.set(selectedIndex);
            }

            int currentIndex = 0;
            Iterator<T> iter = activeReads.iterator();

            while ( iter.hasNext() ) {
                T read = iter.next();

                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }

                currentIndex++;
            }
        }

    }
}
|
||||
|
|
@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord;
|
|||
*/
|
||||
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
|
||||
|
||||
/*
|
||||
/**
|
||||
* Does this downsampler require that reads be fed to it in coordinate order?
|
||||
*
|
||||
* @return true if reads must be submitted to this downsampler in coordinate order, otherwise false
|
||||
*/
|
||||
public boolean requiresCoordinateSortOrder();
|
||||
|
||||
/**
|
||||
* Tell this downsampler that no more reads located before the provided read (according to
|
||||
* the sort order of the read stream) will be fed to it.
|
||||
*
|
||||
* Allows position-aware downsamplers to finalize pending reads earlier than they would
|
||||
* otherwise be able to, particularly when doing per-sample downsampling and reads for
|
||||
* certain samples are sparser than average.
|
||||
*
|
||||
* @param read the downsampler will assume that no reads located before this read will ever
|
||||
* be submitted to it in the future
|
||||
*/
|
||||
public void signalNoMoreReadsBefore( T read );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,37 @@
|
|||
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.downsampling;

import net.sf.samtools.SAMRecord;

/**
 * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular
 * downsampler, all sharing the same construction parameters.
 *
 * @author David Roazen
 */
public interface ReadsDownsamplerFactory<T extends SAMRecord> {

    /**
     * Create a fresh, independent downsampler instance configured with this
     * factory's construction parameters.
     *
     * @return a new ReadsDownsampler ready to accept reads
     */
    public ReadsDownsampler<T> newInstance();
}
|
||||
|
|
@ -48,6 +48,14 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
|
||||
private int totalReadsSeen;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a ReservoirDownsampler
|
||||
*
|
||||
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
|
||||
* after downsampling will be min(totalReads, targetSampleSize)
|
||||
*/
|
||||
public ReservoirDownsampler ( int targetSampleSize ) {
|
||||
if ( targetSampleSize <= 0 ) {
|
||||
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
||||
|
|
@ -55,6 +63,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
|
||||
this.targetSampleSize = targetSampleSize;
|
||||
clear();
|
||||
reset();
|
||||
}
|
||||
|
||||
public void submit ( T newRead ) {
|
||||
|
|
@ -68,6 +77,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
if ( randomSlot < targetSampleSize ) {
|
||||
reservoir.set(randomSlot, newRead);
|
||||
}
|
||||
numDiscardedItems++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -77,11 +87,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
public boolean hasFinalizedItems() {
|
||||
return reservoir.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = reservoir;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
|
|
@ -91,16 +102,36 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
return false;
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return reservoir.isEmpty() ? null : reservoir.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir = new ArrayList<T>(targetSampleSize);
|
||||
totalReadsSeen = 0;
|
||||
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Factory for creating ReservoirDownsamplers on demand
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ReservoirDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {
|
||||
|
||||
private int targetSampleSize;
|
||||
|
||||
public ReservoirDownsamplerFactory( int targetSampleSize ) {
|
||||
this.targetSampleSize = targetSampleSize;
|
||||
}
|
||||
|
||||
public ReadsDownsampler<T> newInstance() {
|
||||
return new ReservoirDownsampler<T>(targetSampleSize);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage
|
||||
* using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private int targetCoverage;
|
||||
|
||||
private ReservoirDownsampler<T> reservoir;
|
||||
|
||||
private int currentContigIndex;
|
||||
|
||||
private int currentAlignmentStart;
|
||||
|
||||
private boolean positionEstablished;
|
||||
|
||||
private boolean unmappedReadsReached;
|
||||
|
||||
private ArrayList<T> finalizedReads;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a SimplePositionalDownsampler
|
||||
*
|
||||
* @param targetCoverage Maximum number of reads that may share any given alignment start position
|
||||
*/
|
||||
public SimplePositionalDownsampler( int targetCoverage ) {
|
||||
this.targetCoverage = targetCoverage;
|
||||
reservoir = new ReservoirDownsampler<T>(targetCoverage);
|
||||
finalizedReads = new ArrayList<T>();
|
||||
clear();
|
||||
reset();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
updatePositionalState(newRead);
|
||||
|
||||
if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream
|
||||
finalizedReads.add(newRead);
|
||||
}
|
||||
else {
|
||||
int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems();
|
||||
reservoir.submit(newRead);
|
||||
numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems;
|
||||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasFinalizedItems() {
|
||||
return finalizedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> toReturn = finalizedReads;
|
||||
finalizedReads = new ArrayList<T>();
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return reservoir.hasFinalizedItems();
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return finalizedReads.isEmpty() ? null : finalizedReads.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return reservoir.peekFinalized();
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
finalizeReservoir();
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir.clear();
|
||||
reservoir.reset();
|
||||
finalizedReads.clear();
|
||||
positionEstablished = false;
|
||||
unmappedReadsReached = false;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
updatePositionalState(read);
|
||||
}
|
||||
|
||||
private void updatePositionalState( T newRead ) {
|
||||
if ( readIsPastCurrentPosition(newRead) ) {
|
||||
if ( reservoir.hasFinalizedItems() ) {
|
||||
finalizeReservoir();
|
||||
}
|
||||
|
||||
setCurrentPosition(newRead);
|
||||
|
||||
if ( newRead.getReadUnmappedFlag() ) {
|
||||
unmappedReadsReached = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setCurrentPosition( T read ) {
|
||||
currentContigIndex = read.getReferenceIndex();
|
||||
currentAlignmentStart = read.getAlignmentStart();
|
||||
positionEstablished = true;
|
||||
}
|
||||
|
||||
private boolean readIsPastCurrentPosition( T read ) {
|
||||
return ! positionEstablished ||
|
||||
read.getReferenceIndex() > currentContigIndex ||
|
||||
read.getAlignmentStart() > currentAlignmentStart ||
|
||||
(read.getReadUnmappedFlag() && ! unmappedReadsReached);
|
||||
}
|
||||
|
||||
private void finalizeReservoir() {
|
||||
finalizedReads.addAll(reservoir.consumeFinalizedItems());
|
||||
reservoir.reset();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Factory for creating SimplePositionalDownsamplers on demand
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SimplePositionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {
|
||||
|
||||
private int targetCoverage;
|
||||
|
||||
public SimplePositionalDownsamplerFactory( int targetCoverage ) {
|
||||
this.targetCoverage = targetCoverage;
|
||||
}
|
||||
|
||||
public ReadsDownsampler<T> newInstance() {
|
||||
return new SimplePositionalDownsampler<T>(targetCoverage);
|
||||
}
|
||||
}
|
||||
|
|
@ -8,9 +8,11 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
|||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.threading.EfficiencyMonitoringThreadFactory;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
|
||||
|
||||
import java.util.Collection;
|
||||
|
|
@ -75,14 +77,27 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
/**
|
||||
* Create a new hierarchical microscheduler to process the given reads and reference.
|
||||
*
|
||||
* @param walker the walker used to process the dataset.
|
||||
* @param reads Reads file(s) to process.
|
||||
* @param reference Reference for driving the traversal.
|
||||
* @param nThreadsToUse maximum number of threads to use to do the work
|
||||
* @param walker the walker used to process the dataset.
|
||||
* @param reads Reads file(s) to process.
|
||||
* @param reference Reference for driving the traversal.
|
||||
* @param threadAllocation How should we apply multi-threaded execution?
|
||||
*/
|
||||
protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse ) {
|
||||
super(engine, walker, reads, reference, rods);
|
||||
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
||||
protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine,
|
||||
final Walker walker,
|
||||
final SAMDataSource reads,
|
||||
final IndexedFastaSequenceFile reference,
|
||||
final Collection<ReferenceOrderedDataSource> rods,
|
||||
final ThreadAllocation threadAllocation) {
|
||||
super(engine, walker, reads, reference, rods, threadAllocation);
|
||||
|
||||
final int nThreadsToUse = threadAllocation.getNumDataThreads();
|
||||
if ( threadAllocation.monitorThreadEfficiency() ) {
|
||||
final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse);
|
||||
setThreadEfficiencyMonitor(monitoringThreadFactory);
|
||||
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory);
|
||||
} else {
|
||||
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
||||
}
|
||||
}
|
||||
|
||||
public Object execute( Walker walker, Iterable<Shard> shardStrategy ) {
|
||||
|
|
@ -140,6 +155,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
// do final cleanup operations
|
||||
outputTracker.close();
|
||||
cleanup();
|
||||
executionIsDone();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,9 +10,11 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
|||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
|
|
@ -33,8 +35,16 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
* @param reference Reference for driving the traversal.
|
||||
* @param rods Reference-ordered data.
|
||||
*/
|
||||
protected LinearMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods ) {
|
||||
super(engine, walker, reads, reference, rods);
|
||||
protected LinearMicroScheduler(final GenomeAnalysisEngine engine,
|
||||
final Walker walker,
|
||||
final SAMDataSource reads,
|
||||
final IndexedFastaSequenceFile reference,
|
||||
final Collection<ReferenceOrderedDataSource> rods,
|
||||
final ThreadAllocation threadAllocation) {
|
||||
super(engine, walker, reads, reference, rods, threadAllocation);
|
||||
|
||||
if ( threadAllocation.monitorThreadEfficiency() )
|
||||
setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -49,11 +59,12 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
|
||||
boolean done = walker.isDone();
|
||||
int counter = 0;
|
||||
|
||||
traversalEngine.startTimersIfNecessary();
|
||||
for (Shard shard : shardStrategy ) {
|
||||
if ( done || shard == null ) // we ran out of shards that aren't owned
|
||||
break;
|
||||
|
||||
traversalEngine.startTimersIfNecessary();
|
||||
if(shard.getShardType() == Shard.ShardType.LOCUS) {
|
||||
WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
|
||||
getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
|
||||
|
|
@ -88,6 +99,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
|
||||
outputTracker.close();
|
||||
cleanup();
|
||||
executionIsDone();
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.traversals.*;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
|
||||
|
||||
import javax.management.JMException;
|
||||
import javax.management.MBeanServer;
|
||||
|
|
@ -58,6 +59,8 @@ import java.util.Collection;
|
|||
|
||||
/** Shards and schedules data in manageable chunks. */
|
||||
public abstract class MicroScheduler implements MicroSchedulerMBean {
|
||||
// TODO -- remove me and retire non nano scheduled versions of traversals
|
||||
private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true;
|
||||
protected static final Logger logger = Logger.getLogger(MicroScheduler.class);
|
||||
|
||||
/**
|
||||
|
|
@ -79,6 +82,13 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
private final MBeanServer mBeanServer;
|
||||
private final ObjectName mBeanName;
|
||||
|
||||
/**
|
||||
* Threading efficiency monitor for tracking the resource utilization of the GATK
|
||||
*
|
||||
* may be null
|
||||
*/
|
||||
ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
|
||||
|
||||
/**
|
||||
* MicroScheduler factory function. Create a microscheduler appropriate for reducing the
|
||||
* selected walker.
|
||||
|
|
@ -92,18 +102,36 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
* @return The best-fit microscheduler.
|
||||
*/
|
||||
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
|
||||
if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
|
||||
if(walker.isReduceByInterval())
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
if(walker instanceof ReadWalker)
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
|
||||
} else {
|
||||
if(threadAllocation.getNumCPUThreads() > 1)
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
|
||||
if ( threadAllocation.isRunningInParallelMode() ) {
|
||||
// TODO -- remove me when we fix running NCT within HMS
|
||||
if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1)
|
||||
throw new UserException("Currently the GATK does not support running CPU threads within data threads, " +
|
||||
"please specify only one of NT and NCT");
|
||||
|
||||
logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)",
|
||||
threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads()));
|
||||
}
|
||||
|
||||
if ( threadAllocation.getNumDataThreads() > 1 ) {
|
||||
if (walker.isReduceByInterval())
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
|
||||
if ( ! (walker instanceof TreeReducible) ) {
|
||||
throw badNT("nt", engine, walker);
|
||||
} else {
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
|
||||
}
|
||||
} else {
|
||||
if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) )
|
||||
throw badNT("nct", engine, walker);
|
||||
return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
|
||||
}
|
||||
}
|
||||
|
||||
private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) {
|
||||
throw new UserException.BadArgumentValue("nt",
|
||||
String.format("The analysis %s currently does not support parallel execution with %s. " +
|
||||
"Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -113,17 +141,27 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
* @param reads The reads.
|
||||
* @param reference The reference.
|
||||
* @param rods the rods to include in the traversal
|
||||
* @param threadAllocation the allocation of threads to use in the underlying traversal
|
||||
*/
|
||||
protected MicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods) {
|
||||
protected MicroScheduler(final GenomeAnalysisEngine engine,
|
||||
final Walker walker,
|
||||
final SAMDataSource reads,
|
||||
final IndexedFastaSequenceFile reference,
|
||||
final Collection<ReferenceOrderedDataSource> rods,
|
||||
final ThreadAllocation threadAllocation) {
|
||||
this.engine = engine;
|
||||
this.reads = reads;
|
||||
this.reference = reference;
|
||||
this.rods = rods;
|
||||
|
||||
if (walker instanceof ReadWalker) {
|
||||
traversalEngine = new TraverseReads();
|
||||
traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1
|
||||
? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread())
|
||||
: new TraverseReads();
|
||||
} else if (walker instanceof LocusWalker) {
|
||||
traversalEngine = new TraverseLoci();
|
||||
traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1
|
||||
? new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread())
|
||||
: new TraverseLociLinear();
|
||||
} else if (walker instanceof DuplicateWalker) {
|
||||
traversalEngine = new TraverseDuplicates();
|
||||
} else if (walker instanceof ReadPairWalker) {
|
||||
|
|
@ -150,6 +188,24 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one
|
||||
*
|
||||
* @return the monitor, or null if none is active
|
||||
*/
|
||||
public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
|
||||
return threadEfficiencyMonitor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses
|
||||
*
|
||||
* @param threadEfficiencyMonitor
|
||||
*/
|
||||
public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) {
|
||||
this.threadEfficiencyMonitor = threadEfficiencyMonitor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Walks a walker over the given list of intervals.
|
||||
*
|
||||
|
|
@ -183,6 +239,18 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
traversalEngine.printOnTraversalDone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be called by subclasses when execute is done
|
||||
*/
|
||||
protected void executionIsDone() {
|
||||
// Print out the threading efficiency of this HMS, if state monitoring is enabled
|
||||
if ( threadEfficiencyMonitor != null ) {
|
||||
// include the master thread information
|
||||
threadEfficiencyMonitor.threadIsDone(Thread.currentThread());
|
||||
threadEfficiencyMonitor.printUsageInformation(logger);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the engine that created this microscheduler.
|
||||
* @return The engine owning this microscheduler.
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -81,7 +82,13 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
|
|||
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals, Collection<String> sampleNames) {
|
||||
this.sourceInfo = shard.getReadProperties();
|
||||
this.readIterator = iterator;
|
||||
this.sourceIterator = new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
|
||||
|
||||
// Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested:
|
||||
this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ?
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames))
|
||||
:
|
||||
new PeekableIterator<AlignmentContext>(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
|
||||
|
||||
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,9 +25,14 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Collections2;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.help.GATKDocUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Manage filters and filter options. Any requests for basic filtering classes
|
||||
|
|
@ -54,4 +59,39 @@ public class FilterManager extends PluginManager<ReadFilter> {
|
|||
public Collection<Class<? extends ReadFilter>> getValues() {
|
||||
return this.getPlugins();
|
||||
}
|
||||
|
||||
/**
|
||||
* Rather than use the default error message, print out a list of read filters as well.
|
||||
* @param pluginCategory - string, the category of the plugin (e.g. read filter)
|
||||
* @param pluginName - string, what we were trying to match (but failed to)
|
||||
* @return - A wall of text with the default message, followed by a listing of available read filters
|
||||
*/
|
||||
@Override
|
||||
protected String formatErrorMessage(String pluginCategory, String pluginName) {
|
||||
List<Class<? extends ReadFilter>> availableFilters = this.getPluginsImplementing(ReadFilter.class);
|
||||
|
||||
|
||||
return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName,
|
||||
userFriendlyListofReadFilters(availableFilters),
|
||||
"Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information.");
|
||||
}
|
||||
|
||||
private String userFriendlyListofReadFilters(List<Class<? extends ReadFilter>> filters) {
|
||||
final String headName = "FilterName", headDoc = "Documentation";
|
||||
int longestNameLength = -1;
|
||||
for ( Class < ? extends ReadFilter> filter : filters ) {
|
||||
longestNameLength = Math.max(longestNameLength,this.getName(filter).length());
|
||||
}
|
||||
String format = " %"+longestNameLength+"s %s%n";
|
||||
|
||||
StringBuilder listBuilder = new StringBuilder();
|
||||
listBuilder.append(String.format(format,headName,headDoc));
|
||||
for ( Class<? extends ReadFilter> filter : filters ) {
|
||||
String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter);
|
||||
String filterName = this.getName(filter);
|
||||
listBuilder.append(String.format(format,filterName,helpLink));
|
||||
}
|
||||
|
||||
return listBuilder.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,12 +31,16 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A stub for routing and management of SAM file reading and writing.
|
||||
|
|
@ -116,15 +120,15 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
|
|||
*/
|
||||
private boolean simplifyBAM = false;
|
||||
|
||||
private List<ReadTransformer> onOutputReadTransformers = null;
|
||||
|
||||
/**
|
||||
* Create a new stub given the requested SAM file and compression level.
|
||||
* @param engine source of header data, maybe other data about input files.
|
||||
* @param samFile SAM file to (ultimately) create.
|
||||
*/
|
||||
public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) {
|
||||
this.engine = engine;
|
||||
this.samFile = samFile;
|
||||
this.samOutputStream = null;
|
||||
this(engine, samFile, null);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -133,8 +137,12 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
|
|||
* @param stream Output stream to which data should be written.
|
||||
*/
|
||||
public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) {
|
||||
this(engine, null, stream);
|
||||
}
|
||||
|
||||
private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) {
|
||||
this.engine = engine;
|
||||
this.samFile = null;
|
||||
this.samFile = samFile;
|
||||
this.samOutputStream = stream;
|
||||
}
|
||||
|
||||
|
|
@ -274,17 +282,29 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
|
|||
this.headerOverride = header;
|
||||
}
|
||||
|
||||
private void initializeReadTransformers() {
|
||||
this.onOutputReadTransformers = new ArrayList<ReadTransformer>(engine.getReadTransformers().size());
|
||||
for ( final ReadTransformer transformer : engine.getReadTransformers() ) {
|
||||
if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT )
|
||||
onOutputReadTransformers.add(transformer);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @{inheritDoc}
|
||||
*/
|
||||
public void addAlignment( SAMRecord alignment ) {
|
||||
if ( engine.getArguments().BAQMode != BAQ.CalculationMode.OFF && engine.getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_OUTPUT ) {
|
||||
//System.out.printf("Writing BAQ at OUTPUT TIME%n");
|
||||
baqHMM.baqRead(alignment, engine.getReferenceDataSource().getReference(), engine.getArguments().BAQMode, engine.getWalkerBAQQualityMode());
|
||||
}
|
||||
public void addAlignment( final SAMRecord readIn ) {
|
||||
if ( onOutputReadTransformers == null )
|
||||
initializeReadTransformers();
|
||||
|
||||
GATKSAMRecord workingRead = (GATKSAMRecord)readIn;
|
||||
|
||||
// run on output read transformers
|
||||
for ( final ReadTransformer transform : onOutputReadTransformers )
|
||||
workingRead = transform.apply(workingRead);
|
||||
|
||||
writeStarted = true;
|
||||
outputTracker.getStorage(this).addAlignment(alignment);
|
||||
outputTracker.getStorage(this).addAlignment(workingRead);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -32,9 +32,9 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -269,7 +269,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
|
|||
* @return
|
||||
*/
|
||||
public boolean alsoWriteBCFForTest() {
|
||||
return engine.getArguments().numberOfThreads == 1 && // only works single threaded
|
||||
return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded
|
||||
! isCompressed() && // for non-compressed outputs
|
||||
getFile() != null && // that are going to disk
|
||||
engine.getArguments().generateShadowBCF; // and we actually want to do it
|
||||
|
|
|
|||
|
|
@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|||
import java.util.Iterator;
|
||||
|
||||
|
||||
public class DownsampleIterator implements StingSAMIterator {
|
||||
public class LegacyDownsampleIterator implements StingSAMIterator {
|
||||
|
||||
StingSAMIterator it;
|
||||
int cutoff;
|
||||
SAMRecord next;
|
||||
|
||||
public DownsampleIterator(StingSAMIterator it, double fraction) {
|
||||
public LegacyDownsampleIterator(StingSAMIterator it, double fraction) {
|
||||
this.it = it;
|
||||
cutoff = (int)(fraction * 10000);
|
||||
next = getNextRecord();
|
||||
|
|
@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -318,6 +318,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
continue;
|
||||
|
||||
if (op == CigarOperator.D) {
|
||||
// TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
|
||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
|
||||
size++;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,649 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
|
||||
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
|
||||
*/
|
||||
public class LocusIteratorByStateExperimental extends LocusIterator {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// member fields
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Used to create new GenomeLocs.
|
||||
*/
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
private final ArrayList<String> samples;
|
||||
private final ReadStateManager readStates;
|
||||
|
||||
protected static class SAMRecordState {
|
||||
SAMRecord read;
|
||||
int readOffset = -1; // how far are we offset from the start of the read bases?
|
||||
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
|
||||
|
||||
Cigar cigar = null;
|
||||
int cigarOffset = -1;
|
||||
CigarElement curElement = null;
|
||||
int nCigarElements = 0;
|
||||
|
||||
int cigarElementCounter = -1; // how far are we into a single cigarElement
|
||||
|
||||
// The logical model for generating extended events is as follows: the "record state" implements the traversal
|
||||
// along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
|
||||
// can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
|
||||
// deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
|
||||
// if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
|
||||
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
|
||||
// events immediately preceding the current reference base).
|
||||
|
||||
public SAMRecordState(SAMRecord read) {
|
||||
this.read = read;
|
||||
cigar = read.getCigar();
|
||||
nCigarElements = cigar.numCigarElements();
|
||||
|
||||
//System.out.printf("Creating a SAMRecordState: %s%n", this);
|
||||
}
|
||||
|
||||
public SAMRecord getRead() {
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* What is our current offset in the read's bases that aligns us with the reference genome?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public int getReadOffset() {
|
||||
return readOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public int getGenomeOffset() {
|
||||
return genomeOffset;
|
||||
}
|
||||
|
||||
public int getGenomePosition() {
|
||||
return read.getAlignmentStart() + getGenomeOffset();
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
|
||||
return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
|
||||
}
|
||||
|
||||
public CigarOperator getCurrentCigarOperator() {
|
||||
return curElement.getOperator();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
|
||||
}
|
||||
|
||||
public CigarElement peekForwardOnGenome() {
|
||||
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
|
||||
}
|
||||
|
||||
public CigarElement peekBackwardOnGenome() {
|
||||
return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
|
||||
}
|
||||
|
||||
|
||||
public CigarOperator stepForwardOnGenome() {
|
||||
// we enter this method with readOffset = index of the last processed base on the read
|
||||
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
|
||||
|
||||
|
||||
if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
|
||||
cigarOffset++;
|
||||
if (cigarOffset < nCigarElements) {
|
||||
curElement = cigar.getCigarElement(cigarOffset);
|
||||
cigarElementCounter = 0;
|
||||
// next line: guards against cigar elements of length 0; when new cigar element is retrieved,
|
||||
// we reenter in order to re-check cigarElementCounter against curElement's length
|
||||
return stepForwardOnGenome();
|
||||
} else {
|
||||
if (curElement != null && curElement.getOperator() == CigarOperator.D)
|
||||
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
|
||||
|
||||
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
|
||||
// we fall into this else block only when indels end the read, increment genomeOffset such that the
|
||||
// current offset of this read is the next ref base after the end of the indel. This position will
|
||||
// model a point on the reference somewhere after the end of the read.
|
||||
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
|
||||
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
boolean done = false;
|
||||
switch (curElement.getOperator()) {
|
||||
case H: // ignore hard clips
|
||||
case P: // ignore pads
|
||||
cigarElementCounter = curElement.getLength();
|
||||
break;
|
||||
case I: // insertion w.r.t. the reference
|
||||
case S: // soft clip
|
||||
cigarElementCounter = curElement.getLength();
|
||||
readOffset += curElement.getLength();
|
||||
break;
|
||||
case D: // deletion w.r.t. the reference
|
||||
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
||||
throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
|
||||
// should be the same as N case
|
||||
genomeOffset++;
|
||||
done = true;
|
||||
break;
|
||||
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
|
||||
genomeOffset++;
|
||||
done = true;
|
||||
break;
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
readOffset++;
|
||||
genomeOffset++;
|
||||
done = true;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
|
||||
}
|
||||
|
||||
return done ? curElement.getOperator() : stepForwardOnGenome();
|
||||
}
|
||||
}
|
||||
|
||||
//final boolean DEBUG = false;
|
||||
//final boolean DEBUG2 = false && DEBUG;
|
||||
private ReadProperties readInfo;
|
||||
private AlignmentContext nextAlignmentContext;
|
||||
private boolean performLevelingDownsampling;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors and other basic operations
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public LocusIteratorByStateExperimental(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
|
||||
this.readInfo = readInformation;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.samples = new ArrayList<String>(samples);
|
||||
this.readStates = new ReadStateManager(samIterator);
|
||||
|
||||
this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null &&
|
||||
readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
|
||||
readInfo.getDownsamplingMethod().toCoverage != null;
|
||||
|
||||
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
|
||||
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
|
||||
if (this.samples.isEmpty() && samIterator.hasNext()) {
|
||||
throw new IllegalArgumentException("samples list must not be empty");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
|
||||
* for the system.
|
||||
*/
|
||||
public final static Collection<String> sampleListForSAMWithoutReadGroups() {
|
||||
List<String> samples = new ArrayList<String>();
|
||||
samples.add(null);
|
||||
return samples;
|
||||
}
|
||||
|
||||
public Iterator<AlignmentContext> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
//this.it.close();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
lazyLoadNextAlignmentContext();
|
||||
return (nextAlignmentContext != null);
|
||||
//if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
|
||||
}
|
||||
|
||||
private GenomeLoc getLocation() {
|
||||
return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// next() routine and associated collection operations
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
public AlignmentContext next() {
|
||||
lazyLoadNextAlignmentContext();
|
||||
if (!hasNext())
|
||||
throw new NoSuchElementException("LocusIteratorByState: out of elements.");
|
||||
AlignmentContext currentAlignmentContext = nextAlignmentContext;
|
||||
nextAlignmentContext = null;
|
||||
return currentAlignmentContext;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the next alignment context from the given state. Note that this is implemented as a lazy load method.
|
||||
* nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
|
||||
*/
|
||||
private void lazyLoadNextAlignmentContext() {
|
||||
while (nextAlignmentContext == null && readStates.hasNext()) {
|
||||
readStates.collectPendingReads();
|
||||
|
||||
final GenomeLoc location = getLocation();
|
||||
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
|
||||
// TODO: How can you determine here whether the current pileup has been downsampled?
|
||||
boolean hasBeenSampled = false;
|
||||
|
||||
for (final String sample : samples) {
|
||||
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
|
||||
int size = 0; // number of elements in this sample's pileup
|
||||
int nDeletions = 0; // number of deletions in this sample's pileup
|
||||
int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
||||
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
|
||||
final boolean isSingleElementCigar = nextElement == lastElement;
|
||||
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
|
||||
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
|
||||
int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
|
||||
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
|
||||
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
|
||||
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
|
||||
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar;
|
||||
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
|
||||
|
||||
int nextElementLength = nextElement.getLength();
|
||||
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
if (op == CigarOperator.D) {
|
||||
// TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
|
||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
|
||||
size++;
|
||||
nDeletions++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (!filterBaseInRead(read, location.getStart())) {
|
||||
String insertedBaseString = null;
|
||||
if (nextOp == CigarOperator.I) {
|
||||
final int insertionOffset = isSingleElementCigar ? 0 : 1;
|
||||
// TODO -- someone please implement a better fix for the single element insertion CIGAR!
|
||||
if (isSingleElementCigar)
|
||||
readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases!
|
||||
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength()));
|
||||
}
|
||||
|
||||
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
|
||||
size++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
|
||||
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
|
||||
}
|
||||
|
||||
updateReadStates(); // critical - must be called after we get the current state offsets and location
|
||||
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
|
||||
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
|
||||
}
|
||||
}
|
||||
|
||||
// fast testing of position
|
||||
private boolean readIsPastCurrentPosition(SAMRecord read) {
|
||||
if (readStates.isEmpty())
|
||||
return false;
|
||||
else {
|
||||
SAMRecordState state = readStates.getFirst();
|
||||
SAMRecord ourRead = state.getRead();
|
||||
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic place to put per-base filters appropriate to LocusIteratorByState
|
||||
*
|
||||
* @param rec
|
||||
* @param pos
|
||||
* @return
|
||||
*/
|
||||
private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
|
||||
return ReadUtils.isBaseInsideAdaptor(rec, pos);
|
||||
}
|
||||
|
||||
private void updateReadStates() {
|
||||
for (final String sample : samples) {
|
||||
Iterator<SAMRecordState> it = readStates.iterator(sample);
|
||||
while (it.hasNext()) {
|
||||
SAMRecordState state = it.next();
|
||||
CigarOperator op = state.stepForwardOnGenome();
|
||||
if (op == null) {
|
||||
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
|
||||
// already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
|
||||
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
|
||||
it.remove(); // we've stepped off the end of the object
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
protected class ReadStateManager {
|
||||
private final PeekableIterator<SAMRecord> iterator;
|
||||
private final SamplePartitioner samplePartitioner;
|
||||
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
|
||||
private int totalReadStates = 0;
|
||||
|
||||
public ReadStateManager(Iterator<SAMRecord> source) {
|
||||
this.iterator = new PeekableIterator<SAMRecord>(source);
|
||||
|
||||
for (final String sample : samples) {
|
||||
readStatesBySample.put(sample, new PerSampleReadStateManager());
|
||||
}
|
||||
|
||||
samplePartitioner = new SamplePartitioner();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
|
||||
* for this iterator; if present, total read states will be decremented.
|
||||
*
|
||||
* @param sample The sample.
|
||||
* @return Iterator over the reads associated with that sample.
|
||||
*/
|
||||
public Iterator<SAMRecordState> iterator(final String sample) {
|
||||
return new Iterator<SAMRecordState>() {
|
||||
private Iterator<SAMRecordState> wrappedIterator = readStatesBySample.get(sample).iterator();
|
||||
|
||||
public boolean hasNext() {
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
public SAMRecordState next() {
|
||||
return wrappedIterator.next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
wrappedIterator.remove();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return totalReadStates == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the total number of reads in the manager across all samples.
|
||||
*
|
||||
* @return Total number of reads over all samples.
|
||||
*/
|
||||
public int size() {
|
||||
return totalReadStates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the total number of reads in the manager in the given sample.
|
||||
*
|
||||
* @param sample The sample.
|
||||
* @return Total number of reads in the given sample.
|
||||
*/
|
||||
public int size(final String sample) {
|
||||
return readStatesBySample.get(sample).size();
|
||||
}
|
||||
|
||||
public SAMRecordState getFirst() {
|
||||
for (final String sample : samples) {
|
||||
PerSampleReadStateManager reads = readStatesBySample.get(sample);
|
||||
if (!reads.isEmpty())
|
||||
return reads.peek();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return totalReadStates > 0 || iterator.hasNext();
|
||||
}
|
||||
|
||||
public void collectPendingReads() {
|
||||
if (!iterator.hasNext())
|
||||
return;
|
||||
|
||||
if (readStates.size() == 0) {
|
||||
int firstContigIndex = iterator.peek().getReferenceIndex();
|
||||
int firstAlignmentStart = iterator.peek().getAlignmentStart();
|
||||
while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
|
||||
samplePartitioner.submitRead(iterator.next());
|
||||
}
|
||||
} else {
|
||||
// Fast fail in the case that the read is past the current position.
|
||||
if (readIsPastCurrentPosition(iterator.peek()))
|
||||
return;
|
||||
|
||||
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
|
||||
samplePartitioner.submitRead(iterator.next());
|
||||
}
|
||||
}
|
||||
|
||||
for (final String sample : samples) {
|
||||
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
|
||||
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
|
||||
addReadsToSample(statesBySample, newReads);
|
||||
}
|
||||
|
||||
samplePartitioner.reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add reads with the given sample name to the given hanger entry.
|
||||
*
|
||||
* @param readStates The list of read states to add this collection of reads.
|
||||
* @param reads Reads to add. Selected reads will be pulled from this source.
|
||||
*/
|
||||
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
|
||||
if (reads.isEmpty())
|
||||
return;
|
||||
|
||||
Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();
|
||||
|
||||
for (SAMRecord read : reads) {
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
}
|
||||
|
||||
readStates.addStatesAtNextAlignmentStart(newReadStates);
|
||||
}
|
||||
|
||||
protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
|
||||
private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
|
||||
private int thisSampleReadStates = 0;
|
||||
private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
|
||||
performLevelingDownsampling ?
|
||||
new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
|
||||
null;
|
||||
|
||||
public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
|
||||
if ( states.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
|
||||
thisSampleReadStates += states.size();
|
||||
totalReadStates += states.size();
|
||||
|
||||
if ( levelingDownsampler != null ) {
|
||||
levelingDownsampler.submit(readStatesByAlignmentStart);
|
||||
levelingDownsampler.signalEndOfInput();
|
||||
|
||||
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
|
||||
|
||||
// use returned List directly rather than make a copy, for efficiency's sake
|
||||
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
|
||||
levelingDownsampler.reset();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return readStatesByAlignmentStart.isEmpty();
|
||||
}
|
||||
|
||||
public SAMRecordState peek() {
|
||||
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return thisSampleReadStates;
|
||||
}
|
||||
|
||||
public Iterator<SAMRecordState> iterator() {
|
||||
return new Iterator<SAMRecordState>() {
|
||||
private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
|
||||
private LinkedList<SAMRecordState> currentPositionReadStates = null;
|
||||
private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;
|
||||
|
||||
public boolean hasNext() {
|
||||
return alignmentStartIterator.hasNext() ||
|
||||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
|
||||
}
|
||||
|
||||
public SAMRecordState next() {
|
||||
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
|
||||
currentPositionReadStates = alignmentStartIterator.next();
|
||||
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
|
||||
}
|
||||
|
||||
return currentPositionReadStatesIterator.next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
currentPositionReadStatesIterator.remove();
|
||||
thisSampleReadStates--;
|
||||
totalReadStates--;
|
||||
|
||||
if ( currentPositionReadStates.isEmpty() ) {
|
||||
alignmentStartIterator.remove();
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note: stores reads by sample ID string, not by sample object
|
||||
*/
|
||||
private class SamplePartitioner {
|
||||
private Map<String, Collection<SAMRecord>> readsBySample;
|
||||
private long readsSeen = 0;
|
||||
|
||||
public SamplePartitioner() {
|
||||
readsBySample = new HashMap<String, Collection<SAMRecord>>();
|
||||
|
||||
for ( String sample : samples ) {
|
||||
readsBySample.put(sample, new ArrayList<SAMRecord>());
|
||||
}
|
||||
}
|
||||
|
||||
public void submitRead(SAMRecord read) {
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
if (readsBySample.containsKey(sampleName))
|
||||
readsBySample.get(sampleName).add(read);
|
||||
readsSeen++;
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
return readsSeen;
|
||||
}
|
||||
|
||||
public Collection<SAMRecord> getReadsForSample(String sampleName) {
|
||||
if ( ! readsBySample.containsKey(sampleName) )
|
||||
throw new NoSuchElementException("Sample name not found");
|
||||
return readsBySample.get(sampleName);
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
for ( Collection<SAMRecord> perSampleReads : readsBySample.values() )
|
||||
perSampleReads.clear();
|
||||
readsSeen = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* Baseclass used to describe a read transformer like BAQ and BQSR
|
||||
*
|
||||
* Read transformers are plugable infrastructure that modify read state
|
||||
* either on input, on output, or within walkers themselves.
|
||||
*
|
||||
* The function apply() is called on each read seen by the GATK (after passing
|
||||
* all ReadFilters) and it can do as it sees fit (without modifying the alignment)
|
||||
* to the read to change qualities, add tags, etc.
|
||||
*
|
||||
* Initialize is called once right before the GATK traversal begins providing
|
||||
* the ReadTransformer with the ability to collect and initialize data from the
|
||||
* engine.
|
||||
*
|
||||
* Note that all ReadTransformers within the classpath are created and initialized. If one
|
||||
* shouldn't be run it should look at the command line options of the engine and override
|
||||
* the enabled.
|
||||
*
|
||||
* @since 8/31/12
|
||||
* @author depristo
|
||||
*/
|
||||
abstract public class ReadTransformer {
|
||||
/**
|
||||
* When should this read transform be applied?
|
||||
*/
|
||||
private ApplicationTime applicationTime;
|
||||
|
||||
/**
|
||||
* Keep track of whether we've been initialized already, and ensure it's not called more than once.
|
||||
*/
|
||||
private boolean initialized = false;
|
||||
|
||||
protected ReadTransformer() {}
|
||||
|
||||
/**
|
||||
* Master initialization routine. Called to setup a ReadTransform, using it's overloaded initialialSub routine.
|
||||
*
|
||||
* @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
|
||||
* @param engine the engine, for initializing values
|
||||
* @param walker the walker we intend to run
|
||||
*/
|
||||
@Requires({"initialized == false", "engine != null", "walker != null"})
|
||||
@Ensures("initialized == true")
|
||||
public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) {
|
||||
if ( engine == null ) throw new IllegalArgumentException("engine cannot be null");
|
||||
if ( walker == null ) throw new IllegalArgumentException("walker cannot be null");
|
||||
|
||||
this.applicationTime = initializeSub(engine, walker);
|
||||
if ( overrideTime != null ) this.applicationTime = overrideTime;
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this to initialize themeselves
|
||||
*
|
||||
* @param engine the engine, for initializing values
|
||||
* @param walker the walker we intend to run
|
||||
* @return the point of time we'd like this read transform to be run
|
||||
*/
|
||||
@Requires({"engine != null", "walker != null"})
|
||||
@Ensures("result != null")
|
||||
protected abstract ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker);
|
||||
|
||||
/**
|
||||
* Should this ReadTransformer be activated? Called after initialize, which allows this
|
||||
* read transformer to look at its arguments and decide if it should be active. All
|
||||
* ReadTransformers must override this, as by default they are not enabled.
|
||||
*
|
||||
* @return true if this ReadTransformer should be used on the read stream
|
||||
*/
|
||||
public boolean enabled() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Has this transformer been initialized?
|
||||
*
|
||||
* @return true if it has
|
||||
*/
|
||||
public final boolean isInitialized() {
|
||||
return initialized;
|
||||
}
|
||||
|
||||
/**
|
||||
* When should we apply this read transformer?
|
||||
*
|
||||
* @return true if yes
|
||||
*/
|
||||
public final ApplicationTime getApplicationTime() {
|
||||
return applicationTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* Primary interface function for a read transform to actually do some work
|
||||
*
|
||||
* The function apply() is called on each read seen by the GATK (after passing
|
||||
* all ReadFilters) and it can do as it sees fit (without modifying the alignment)
|
||||
* to the read to change qualities, add tags, etc.
|
||||
*
|
||||
* @param read the read to transform
|
||||
* @return the transformed read
|
||||
*/
|
||||
@Requires("read != null")
|
||||
@Ensures("result != null")
|
||||
abstract public GATKSAMRecord apply(final GATKSAMRecord read);
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
|
||||
/**
|
||||
* When should a read transformer be applied?
|
||||
*/
|
||||
public static enum ApplicationTime {
|
||||
/**
|
||||
* Walker does not tolerate this read transformer
|
||||
*/
|
||||
FORBIDDEN,
|
||||
|
||||
/**
|
||||
* apply the transformation to the incoming reads, the default
|
||||
*/
|
||||
ON_INPUT,
|
||||
|
||||
/**
|
||||
* apply the transformation to the outgoing read stream
|
||||
*/
|
||||
ON_OUTPUT,
|
||||
|
||||
/**
|
||||
* the walker will deal with the calculation itself
|
||||
*/
|
||||
HANDLED_IN_WALKER
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import java.lang.annotation.*;
|
||||
|
||||
/**
|
||||
* User: hanna
|
||||
* Date: May 14, 2009
|
||||
* Time: 1:51:22 PM
|
||||
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
|
||||
* Software and documentation are copyright 2005 by the Broad Institute.
|
||||
* All rights are reserved.
|
||||
*
|
||||
* Users acknowledge that this software is supplied without any warranty or support.
|
||||
* The Broad Institute is not responsible for its use, misuse, or
|
||||
* functionality.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Allows the walker to indicate what type of data it wants to consume.
|
||||
*/
|
||||
|
||||
@Documented
|
||||
@Inherited
|
||||
@Retention(RetentionPolicy.RUNTIME)
|
||||
@Target(ElementType.TYPE)
|
||||
public @interface ReadTransformersMode {
|
||||
public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT;
|
||||
}
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
|
|
@ -11,13 +10,11 @@ import java.util.Iterator;
|
|||
* Verifies that the incoming stream of reads is correctly sorted
|
||||
*/
|
||||
public class VerifyingSamIterator implements StingSAMIterator {
|
||||
private GenomeLocParser genomeLocParser;
|
||||
StingSAMIterator it;
|
||||
SAMRecord last = null;
|
||||
boolean checkOrderP = true;
|
||||
|
||||
public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) {
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
public VerifyingSamIterator(StingSAMIterator it) {
|
||||
this.it = it;
|
||||
}
|
||||
|
||||
|
|
@ -48,9 +45,9 @@ public class VerifyingSamIterator implements StingSAMIterator {
|
|||
if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)
|
||||
throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format()));
|
||||
|
||||
GenomeLoc lastLoc = genomeLocParser.createGenomeLoc( last );
|
||||
GenomeLoc curLoc = genomeLocParser.createGenomeLoc( cur );
|
||||
return curLoc.compareTo(lastLoc) == -1;
|
||||
return (last.getReferenceIndex() > cur.getReferenceIndex()) ||
|
||||
(last.getReferenceIndex().equals(cur.getReferenceIndex()) &&
|
||||
last.getAlignmentStart() > cur.getAlignmentStart());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
|
|||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
|
||||
import org.jets3t.service.S3Service;
|
||||
import org.jets3t.service.S3ServiceException;
|
||||
import org.jets3t.service.impl.rest.httpclient.RestS3Service;
|
||||
|
|
@ -141,6 +142,21 @@ public class GATKRunReport {
|
|||
@Element(required = true, name = "tag")
|
||||
private String tag;
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// elements related to multi-threading and efficiency
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@Element(required = true, name = "numThreads")
|
||||
private int numThreads;
|
||||
@Element(required = true, name = "percent_time_running")
|
||||
private String percentTimeRunning;
|
||||
@Element(required = true, name = "percent_time_waiting")
|
||||
private String percentTimeWaiting;
|
||||
@Element(required = true, name = "percent_time_blocking")
|
||||
private String percentTimeBlocking;
|
||||
@Element(required = true, name = "percent_time_waiting_for_io")
|
||||
private String percentTimeWaitingForIO;
|
||||
|
||||
public enum PhoneHomeOption {
|
||||
/** Disable phone home */
|
||||
NO_ET,
|
||||
|
|
@ -201,12 +217,30 @@ public class GATKRunReport {
|
|||
|
||||
// if there was an exception, capture it
|
||||
this.mException = e == null ? null : new ExceptionToXML(e);
|
||||
|
||||
numThreads = engine.getTotalNumberOfThreads();
|
||||
percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU);
|
||||
percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING);
|
||||
percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING);
|
||||
percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO);
|
||||
}
|
||||
|
||||
public String getID() {
|
||||
return id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representing the percent of time the GATK spent in state, if possible. Otherwise return NA
|
||||
*
|
||||
* @param engine the GATK engine whose threading efficiency info we will use
|
||||
* @param state the state whose occupancy we wish to know
|
||||
* @return a string representation of the percent occupancy of state, or NA is not possible
|
||||
*/
|
||||
private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) {
|
||||
final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor();
|
||||
return tem == null ? "NA" : String.format("%.2f", tem.getStatePercent(state));
|
||||
}
|
||||
|
||||
|
||||
public void postReport(PhoneHomeOption type) {
|
||||
logger.debug("Posting report of type " + type);
|
||||
|
|
|
|||
|
|
@ -1,179 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class ReadMetaDataTracker
|
||||
* <p/>
|
||||
* a read-based meta data tracker
|
||||
*/
|
||||
public class ReadMetaDataTracker {
|
||||
/**
|
||||
* The parser, used to create new GenomeLocs.
|
||||
*/
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
|
||||
private final SAMRecord record;
|
||||
|
||||
// the buffer of positions and RODs we've stored
|
||||
private final TreeMap<Integer, RODMetaDataContainer> mapping;
|
||||
|
||||
/**
|
||||
* create a read meta data tracker, given the read and a queue of RODatum positions
|
||||
*
|
||||
* @param record the read to create offset from
|
||||
* @param mapping the mapping of reference ordered datum
|
||||
*/
|
||||
public ReadMetaDataTracker(GenomeLocParser genomeLocParser, SAMRecord record, TreeMap<Integer, RODMetaDataContainer> mapping) {
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.record = record;
|
||||
this.mapping = mapping;
|
||||
}
|
||||
|
||||
/**
|
||||
* create an alignment of read position to reference ordered datum
|
||||
*
|
||||
* @param record the SAMRecord
|
||||
* @param queue the queue (as a tree set)
|
||||
* @param cl the class name, null if not filtered by classname
|
||||
* @param name the datum track name, null if not filtered by name
|
||||
*
|
||||
* @return a mapping from the position in the read to the reference ordered datum
|
||||
*/
|
||||
private Map<Integer, Collection<GATKFeature>> createReadAlignment(SAMRecord record, TreeMap<Integer, RODMetaDataContainer> queue, Class cl, String name) {
|
||||
if (name != null && cl != null) throw new IllegalStateException("Both a class and name cannot be specified");
|
||||
Map<Integer, Collection<GATKFeature>> ret = new LinkedHashMap<Integer, Collection<GATKFeature>>();
|
||||
GenomeLoc location = genomeLocParser.createGenomeLoc(record);
|
||||
int length = record.getReadLength();
|
||||
for (Integer loc : queue.keySet()) {
|
||||
Integer position = loc - location.getStart();
|
||||
if (position >= 0 && position < length) {
|
||||
Collection<GATKFeature> set;
|
||||
if (cl != null)
|
||||
set = queue.get(loc).getSet(cl);
|
||||
else
|
||||
set = queue.get(loc).getSet(name);
|
||||
if (set != null && set.size() > 0)
|
||||
ret.put(position, set);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* create an alignment of read position to reference ordered datum
|
||||
*
|
||||
* @return a mapping from the position in the read to the reference ordered datum
|
||||
*/
|
||||
private Map<Integer, Collection<GATKFeature>> createGenomeLocAlignment(SAMRecord record, TreeMap<Integer, RODMetaDataContainer> mapping, Class cl, String name) {
|
||||
Map<Integer, Collection<GATKFeature>> ret = new LinkedHashMap<Integer, Collection<GATKFeature>>();
|
||||
int start = record.getAlignmentStart();
|
||||
int stop = record.getAlignmentEnd();
|
||||
for (Integer location : mapping.keySet()) {
|
||||
if (location >= start && location <= stop)
|
||||
if (cl != null)
|
||||
ret.put(location, mapping.get(location).getSet(cl));
|
||||
else
|
||||
ret.put(location, mapping.get(location).getSet(name));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of read offset to ROD(s)
|
||||
*/
|
||||
public Map<Integer, Collection<GATKFeature>> getReadOffsetMapping() {
|
||||
return createReadAlignment(record, mapping, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of genome loc position to ROD(s)
|
||||
*/
|
||||
public Map<Integer, Collection<GATKFeature>> getContigOffsetMapping() {
|
||||
return createGenomeLocAlignment(record, mapping, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of read offset to ROD(s)
|
||||
*/
|
||||
public Map<Integer, Collection<GATKFeature>> getReadOffsetMapping(String name) {
|
||||
return createReadAlignment(record, mapping, null, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of genome loc position to ROD(s)
|
||||
*/
|
||||
public Map<Integer, Collection<GATKFeature>> getContigOffsetMapping(String name) {
|
||||
return createGenomeLocAlignment(record, mapping, null, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of read offset to ROD(s)
|
||||
*/
|
||||
public Map<Integer, Collection<GATKFeature>> getReadOffsetMapping(Class cl) {
|
||||
return createReadAlignment(record, mapping, cl, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the position mapping, from read offset to ROD
|
||||
*
|
||||
* @return a mapping of genome loc position to ROD(s)
|
||||
*/
|
||||
public Map<Integer, Collection<GATKFeature>> getContigOffsetMapping(Class cl) {
|
||||
return createGenomeLocAlignment(record, mapping, cl, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the list of all the RODS overlapping this read, without any information about their position
|
||||
* @return a Collection (no order guaranteed), of all the RODs covering this read
|
||||
*/
|
||||
public List<GATKFeature> getAllCoveringRods() {
|
||||
List<GATKFeature> ret = new ArrayList<GATKFeature>();
|
||||
for (Map.Entry<Integer, RODMetaDataContainer> entry : mapping.entrySet())
|
||||
ret.addAll(entry.getValue().getSet());
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
|
@ -5,7 +5,6 @@ import com.google.java.contract.Requires;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -32,11 +31,10 @@ import java.util.*;
|
|||
* Time: 3:05:23 PM
|
||||
*/
|
||||
public class RefMetaDataTracker {
|
||||
// TODO: this should be a list, not a map, actually
|
||||
// TODO: this should be a list, not a bindings, actually
|
||||
private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY");
|
||||
|
||||
final Map<String, RODRecordList> map;
|
||||
final ReferenceContext ref;
|
||||
final Map<String, RODRecordList> bindings;
|
||||
final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class);
|
||||
|
||||
// ------------------------------------------------------------------------------------------
|
||||
|
|
@ -48,28 +46,25 @@ public class RefMetaDataTracker {
|
|||
// ------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Only for testing -- not accesssible in any other context
|
||||
* Create an tracker with no bindings
|
||||
*/
|
||||
public RefMetaDataTracker() {
|
||||
ref = null;
|
||||
map = Collections.emptyMap();
|
||||
bindings = Collections.emptyMap();
|
||||
}
|
||||
|
||||
public RefMetaDataTracker(final Collection<RODRecordList> allBindings, final ReferenceContext ref) {
|
||||
this.ref = ref;
|
||||
|
||||
// set up the map
|
||||
public RefMetaDataTracker(final Collection<RODRecordList> allBindings) {
|
||||
// set up the bindings
|
||||
if ( allBindings.isEmpty() )
|
||||
map = Collections.emptyMap();
|
||||
bindings = Collections.emptyMap();
|
||||
else {
|
||||
Map<String, RODRecordList> tmap = new HashMap<String, RODRecordList>(allBindings.size());
|
||||
final Map<String, RODRecordList> tmap = new HashMap<String, RODRecordList>(allBindings.size());
|
||||
for ( RODRecordList rod : allBindings ) {
|
||||
if ( rod != null && ! rod.isEmpty() )
|
||||
tmap.put(canonicalName(rod.getName()), rod);
|
||||
}
|
||||
|
||||
// ensure that no one modifies the map itself
|
||||
map = Collections.unmodifiableMap(tmap);
|
||||
// ensure that no one modifies the bindings itself
|
||||
bindings = Collections.unmodifiableMap(tmap);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -99,7 +94,7 @@ public class RefMetaDataTracker {
|
|||
@Requires({"type != null"})
|
||||
@Ensures("result != null")
|
||||
public <T extends Feature> List<T> getValues(final Class<T> type) {
|
||||
return addValues(map.keySet(), type, new ArrayList<T>(), null, false, false);
|
||||
return addValues(bindings.keySet(), type, new ArrayList<T>(), null, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -114,7 +109,7 @@ public class RefMetaDataTracker {
|
|||
@Requires({"type != null", "onlyAtThisLoc != null"})
|
||||
@Ensures("result != null")
|
||||
public <T extends Feature> List<T> getValues(final Class<T> type, final GenomeLoc onlyAtThisLoc) {
|
||||
return addValues(map.keySet(), type, new ArrayList<T>(), onlyAtThisLoc, true, false);
|
||||
return addValues(bindings.keySet(), type, new ArrayList<T>(), onlyAtThisLoc, true, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -296,7 +291,7 @@ public class RefMetaDataTracker {
|
|||
*/
|
||||
@Requires({"rodBinding != null"})
|
||||
public boolean hasValues(final RodBinding rodBinding) {
|
||||
return map.containsKey(canonicalName(rodBinding.getName()));
|
||||
return bindings.containsKey(canonicalName(rodBinding.getName()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -306,7 +301,7 @@ public class RefMetaDataTracker {
|
|||
* @return List of all tracks
|
||||
*/
|
||||
public List<RODRecordList> getBoundRodTracks() {
|
||||
return new ArrayList<RODRecordList>(map.values());
|
||||
return new ArrayList<RODRecordList>(bindings.values());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -314,38 +309,30 @@ public class RefMetaDataTracker {
|
|||
* @return the number of tracks with at least one bound Feature
|
||||
*/
|
||||
public int getNTracksWithBoundFeatures() {
|
||||
return map.size();
|
||||
return bindings.size();
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// old style accessors
|
||||
//
|
||||
// TODO -- DELETE ME
|
||||
//
|
||||
//
|
||||
// Protected accessors using strings for unit testing
|
||||
// ------------------------------------------------------------------------------------------
|
||||
|
||||
@Deprecated
|
||||
public boolean hasValues(final String name) {
|
||||
return map.containsKey(canonicalName(name));
|
||||
protected boolean hasValues(final String name) {
|
||||
return bindings.containsKey(canonicalName(name));
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public <T extends Feature> List<T> getValues(final Class<T> type, final String name) {
|
||||
protected <T extends Feature> List<T> getValues(final Class<T> type, final String name) {
|
||||
return addValues(name, type, new ArrayList<T>(), getTrackDataByName(name), null, false, false);
|
||||
}
|
||||
@Deprecated
|
||||
public <T extends Feature> List<T> getValues(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
|
||||
|
||||
protected <T extends Feature> List<T> getValues(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
|
||||
return addValues(name, type, new ArrayList<T>(), getTrackDataByName(name), onlyAtThisLoc, true, false);
|
||||
}
|
||||
@Deprecated
|
||||
public <T extends Feature> T getFirstValue(final Class<T> type, final String name) {
|
||||
|
||||
protected <T extends Feature> T getFirstValue(final Class<T> type, final String name) {
|
||||
return safeGetFirst(getValues(type, name));
|
||||
}
|
||||
@Deprecated
|
||||
public <T extends Feature> T getFirstValue(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
|
||||
|
||||
protected <T extends Feature> T getFirstValue(final Class<T> type, final String name, final GenomeLoc onlyAtThisLoc) {
|
||||
return safeGetFirst(getValues(type, name, onlyAtThisLoc));
|
||||
}
|
||||
|
||||
|
|
@ -366,7 +353,7 @@ public class RefMetaDataTracker {
|
|||
* @return
|
||||
*/
|
||||
@Requires({"l != null"})
|
||||
final private <T extends Feature> T safeGetFirst(final List<T> l) {
|
||||
private <T extends Feature> T safeGetFirst(final List<T> l) {
|
||||
return l.isEmpty() ? null : l.get(0);
|
||||
}
|
||||
|
||||
|
|
@ -435,7 +422,7 @@ public class RefMetaDataTracker {
|
|||
*/
|
||||
private RODRecordList getTrackDataByName(final String name) {
|
||||
final String luName = canonicalName(name);
|
||||
RODRecordList l = map.get(luName);
|
||||
RODRecordList l = bindings.get(luName);
|
||||
return l == null ? EMPTY_ROD_RECORD_LIST : l;
|
||||
}
|
||||
|
||||
|
|
@ -448,7 +435,7 @@ public class RefMetaDataTracker {
|
|||
* @param name the name of the rod
|
||||
* @return canonical name of the rod
|
||||
*/
|
||||
private final String canonicalName(final String name) {
|
||||
private String canonicalName(final String name) {
|
||||
// todo -- remove me after switch to RodBinding syntax
|
||||
return name.toLowerCase();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.resourcemanagement;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* Models how threads are distributed between various components of the GATK.
|
||||
|
|
@ -33,61 +33,83 @@ public class ThreadAllocation {
|
|||
/**
|
||||
* The number of CPU threads to be used by the GATK.
|
||||
*/
|
||||
private final int numCPUThreads;
|
||||
private final int numDataThreads;
|
||||
|
||||
/**
|
||||
* The number of CPU threads per data thread for GATK processing
|
||||
*/
|
||||
private final int numCPUThreadsPerDataThread;
|
||||
|
||||
/**
|
||||
* Number of threads to devote exclusively to IO. Default is 0.
|
||||
*/
|
||||
private final int numIOThreads;
|
||||
|
||||
public int getNumCPUThreads() {
|
||||
return numCPUThreads;
|
||||
/**
|
||||
* Should we monitor thread efficiency?
|
||||
*/
|
||||
private final boolean monitorEfficiency;
|
||||
|
||||
public int getNumDataThreads() {
|
||||
return numDataThreads;
|
||||
}
|
||||
|
||||
public int getNumCPUThreadsPerDataThread() {
|
||||
return numCPUThreadsPerDataThread;
|
||||
}
|
||||
|
||||
public int getNumIOThreads() {
|
||||
return numIOThreads;
|
||||
}
|
||||
|
||||
public boolean monitorThreadEfficiency() {
|
||||
return monitorEfficiency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are we running in parallel mode?
|
||||
*
|
||||
* @return true if any parallel processing is enabled
|
||||
*/
|
||||
public boolean isRunningInParallelMode() {
|
||||
return getTotalNumThreads() > 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* What is the total number of threads in use by the GATK?
|
||||
*
|
||||
* @return the sum of all thread allocations in this object
|
||||
*/
|
||||
public int getTotalNumThreads() {
|
||||
return getNumDataThreads() * getNumCPUThreadsPerDataThread() + getNumIOThreads();
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct the default thread allocation.
|
||||
*/
|
||||
public ThreadAllocation() {
|
||||
this(1,null,null);
|
||||
this(1, 1, 0, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads.
|
||||
* (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
|
||||
* @param totalThreads Complete number of threads to allocate.
|
||||
* @param numCPUThreads Total number of threads allocated to the traversal.
|
||||
* @param numDataThreads Total number of threads allocated to the traversal.
|
||||
* @param numCPUThreadsPerDataThread The number of CPU threads per data thread to allocate
|
||||
* @param numIOThreads Total number of threads allocated exclusively to IO.
|
||||
* @param monitorEfficiency should we monitor threading efficiency in the GATK?
|
||||
*/
|
||||
public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
|
||||
// If no allocation information is present, allocate all threads to CPU
|
||||
if(numCPUThreads == null && numIOThreads == null) {
|
||||
this.numCPUThreads = totalThreads;
|
||||
this.numIOThreads = 0;
|
||||
}
|
||||
// If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads).
|
||||
else if(numIOThreads == null) {
|
||||
if(numCPUThreads > totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
|
||||
this.numCPUThreads = numCPUThreads;
|
||||
this.numIOThreads = totalThreads - numCPUThreads;
|
||||
}
|
||||
// If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread).
|
||||
else if(numCPUThreads == null) {
|
||||
if(numIOThreads > totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
|
||||
this.numCPUThreads = Math.max(1,totalThreads-numIOThreads);
|
||||
this.numIOThreads = numIOThreads;
|
||||
}
|
||||
else {
|
||||
if(numCPUThreads + numIOThreads != totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
|
||||
this.numCPUThreads = numCPUThreads;
|
||||
this.numIOThreads = numIOThreads;
|
||||
}
|
||||
}
|
||||
public ThreadAllocation(final int numDataThreads,
|
||||
final int numCPUThreadsPerDataThread,
|
||||
final int numIOThreads,
|
||||
final boolean monitorEfficiency) {
|
||||
if ( numDataThreads < 1 ) throw new ReviewedStingException("numDataThreads cannot be less than 1, but saw " + numDataThreads);
|
||||
if ( numCPUThreadsPerDataThread < 1 ) throw new ReviewedStingException("numCPUThreadsPerDataThread cannot be less than 1, but saw " + numCPUThreadsPerDataThread);
|
||||
if ( numIOThreads < 0 ) throw new ReviewedStingException("numIOThreads cannot be less than 0, but saw " + numIOThreads);
|
||||
|
||||
this.numDataThreads = numDataThreads;
|
||||
this.numCPUThreadsPerDataThread = numCPUThreadsPerDataThread;
|
||||
this.numIOThreads = numIOThreads;
|
||||
this.monitorEfficiency = monitorEfficiency;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -168,13 +168,70 @@ public class SampleDB {
|
|||
return families;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all the trios present in the sample database. The strictOneChild parameter determines
|
||||
* whether multiple children of the same parents resolve to multiple trios, or are excluded
|
||||
* @param strictOneChild - exclude pedigrees with >1 child for parental pair
|
||||
* @return - all of the mother+father=child triplets, subject to strictOneChild
|
||||
*/
|
||||
public final Set<Trio> getTrios(boolean strictOneChild) {
|
||||
Set<Trio> trioSet = new HashSet<Trio>();
|
||||
for ( String familyString : getFamilyIDs() ) {
|
||||
Set<Sample> family = getFamily(familyString);
|
||||
for ( Sample sample : family) {
|
||||
if ( sample.getParents().size() == 2 ) {
|
||||
Trio trio = new Trio(sample.getMother(),sample.getFather(),sample);
|
||||
trioSet.add(trio);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( strictOneChild )
|
||||
trioSet = removeTriosWithSameParents(trioSet);
|
||||
|
||||
return trioSet;
|
||||
}
|
||||
|
||||
/**
 * Returns all the trios present in the database, without any restriction on the
 * number of children per parental pair (i.e. getTrios(false)).
 * See {@link #getTrios(boolean)} for the strict one-child-per-pair variant.
 * @return all the trios present in the samples db.
 */
public final Set<Trio> getTrios() {
    return getTrios(false);
}
|
||||
|
||||
/**
|
||||
* Subsets a set of trios to only those with nonmatching founders. If two (or more) trio objects have
|
||||
* the same mother and father, then both (all) are removed from the returned set.
|
||||
* @param trios - a set of Trio objects
|
||||
* @return those subset of Trio objects in the input set with nonmatching founders
|
||||
*/
|
||||
private Set<Trio> removeTriosWithSameParents(final Set<Trio> trios) {
|
||||
Set<Trio> filteredTrios = new HashSet<Trio>();
|
||||
filteredTrios.addAll(trios);
|
||||
Set<Trio> triosWithSameParents = new HashSet<Trio>();
|
||||
for ( Trio referenceTrio : filteredTrios ) {
|
||||
for ( Trio compareTrio : filteredTrios ) {
|
||||
if ( referenceTrio != compareTrio &&
|
||||
referenceTrio.getFather().equals(compareTrio.getFather()) &&
|
||||
referenceTrio.getMother().equals(compareTrio.getMother()) ) {
|
||||
triosWithSameParents.add(referenceTrio);
|
||||
triosWithSameParents.add(compareTrio);
|
||||
}
|
||||
}
|
||||
}
|
||||
filteredTrios.removeAll(triosWithSameParents);
|
||||
return filteredTrios;
|
||||
}
|
||||
|
||||
/**
 * Returns the set of all children that have both of their parents.
 * Note that if a family is composed of more than 1 child, each child is
 * returned (equivalent to calling getChildrenWithParents(false)).
 * @return - all the children that have both of their parents
 * @deprecated - getTrios() replaces this function
 */
@Deprecated
public final Set<Sample> getChildrenWithParents(){
    return getChildrenWithParents(false);
}
|
||||
|
|
@ -188,7 +245,15 @@ public class SampleDB {
|
|||
*
|
||||
* @param triosOnly - if set to true, only strict trios are returned
|
||||
* @return - all the children that have both of their parents
|
||||
* @deprecated - getTrios(boolean strict) replaces this function
|
||||
* @bug -- does not work for extracting multiple generations of trios, e.g.
|
||||
* ..........Mom1------Dad1
|
||||
* ................|
|
||||
* ..............Child1--------Mom2
|
||||
* .......................|
|
||||
* .....................Child2
|
||||
*/
|
||||
@Deprecated
|
||||
public final Set<Sample> getChildrenWithParents(boolean triosOnly) {
|
||||
|
||||
Map<String, Set<Sample>> families = getFamilies();
|
||||
|
|
|
|||
|
|
@ -135,9 +135,8 @@ public class SampleDBBuilder {
|
|||
// --------------------------------------------------------------------------------
|
||||
|
||||
protected final void validate() {
|
||||
if ( validationStrictness == PedigreeValidationType.SILENT )
|
||||
return;
|
||||
else {
|
||||
validatePedigreeIDUniqueness();
|
||||
if ( validationStrictness != PedigreeValidationType.SILENT ) {
|
||||
// check that samples in data sources are all annotated, if anything is annotated
|
||||
if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) {
|
||||
final Set<String> sampleNamesFromPedigrees = new HashSet<String>();
|
||||
|
|
@ -150,4 +149,12 @@ public class SampleDBBuilder {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that every sample parsed from the pedigree files has a unique ID: if
// two pedigree entries share an ID, the de-duplicated set below will be smaller
// than the source collection.
// NOTE(review): this check relies on `assert`, which is a no-op unless the JVM
// is run with -ea -- confirm whether a hard exception is intended here.
private void validatePedigreeIDUniqueness() {
    Set<String> pedigreeIDs = new HashSet<String>();
    for ( Sample sample : samplesFromPedigrees ) {
        pedigreeIDs.add(sample.getID());
    }
    // sizes differ iff at least one ID appeared more than once
    assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?";
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
package org.broadinstitute.sting.gatk.samples;
|
||||
|
||||
/**
|
||||
* A class for imposing a trio structure on three samples; a common paradigm
|
||||
*
|
||||
* todo -- there should probably be an interface or abstract class "Pedigree" that generalizes the notion of
|
||||
* -- imposing structure on samples. But given how complex pedigrees can quickly become, it's not
|
||||
* -- clear the best way to do this.
|
||||
*/
|
||||
public class Trio {
|
||||
private Sample mother;
|
||||
private Sample father;
|
||||
private Sample child;
|
||||
|
||||
public Trio(Sample mom, Sample dad, Sample spawn) {
|
||||
assert mom.getID().equals(spawn.getMaternalID()) && dad.getID().equals(spawn.getPaternalID()) : "Samples passed to trio constructor do not form a trio";
|
||||
mother = mom;
|
||||
father = dad;
|
||||
child = spawn;
|
||||
}
|
||||
|
||||
public Sample getMother() {
|
||||
return mother;
|
||||
}
|
||||
|
||||
public String getMaternalID() {
|
||||
return mother.getID();
|
||||
}
|
||||
|
||||
public Sample getFather() {
|
||||
return father;
|
||||
}
|
||||
|
||||
public String getPaternalID() {
|
||||
return father.getID();
|
||||
}
|
||||
|
||||
public Sample getChild() {
|
||||
return child;
|
||||
}
|
||||
|
||||
public String getChildID() {
|
||||
return child.getID();
|
||||
}
|
||||
}
|
||||
|
|
@ -44,24 +44,12 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,ProviderType extends ShardDataProvider> {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
// Time in milliseconds since we initialized this engine
|
||||
private static final int HISTORY_WINDOW_SIZE = 50;
|
||||
|
||||
private static class ProcessingHistory {
|
||||
double elapsedSeconds;
|
||||
long unitsProcessed;
|
||||
long bpProcessed;
|
||||
GenomeLoc loc;
|
||||
|
||||
public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) {
|
||||
this.elapsedSeconds = elapsedSeconds;
|
||||
this.loc = loc;
|
||||
this.unitsProcessed = unitsProcessed;
|
||||
this.bpProcessed = bpProcessed;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** lock object to sure updates to history are consistent across threads */
|
||||
private static final Object lock = new Object();
|
||||
LinkedList<ProcessingHistory> history = new LinkedList<ProcessingHistory>();
|
||||
|
|
@ -70,13 +58,12 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
private SimpleTimer timer = null;
|
||||
|
||||
// How long can we go without printing some progress info?
|
||||
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
|
||||
private int printProgressCheckCounter = 0;
|
||||
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
|
||||
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
|
||||
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
|
||||
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
|
||||
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
|
||||
|
||||
private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
|
||||
private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
|
||||
private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
|
||||
private long progressPrintFrequency = 10 * 1000; // in milliseconds
|
||||
private boolean progressMeterInitialized = false;
|
||||
|
||||
// for performance log
|
||||
|
|
@ -85,15 +72,12 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
private File performanceLogFile;
|
||||
private PrintStream performanceLog = null;
|
||||
private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log?
|
||||
private final long PERFORMANCE_LOG_PRINT_FREQUENCY = PROGRESS_PRINT_FREQUENCY; // in milliseconds
|
||||
private final long PERFORMANCE_LOG_PRINT_FREQUENCY = progressPrintFrequency; // in milliseconds
|
||||
|
||||
/** Size, in bp, of the area we are processing. Updated once in the system in initial for performance reasons */
|
||||
long targetSize = -1;
|
||||
GenomeLocSortedSet targetIntervals = null;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
protected GenomeAnalysisEngine engine;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------
|
||||
|
|
@ -186,15 +170,35 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the cumulative traversal metrics according to the data in this shard
|
||||
*
|
||||
* @param shard a non-null shard
|
||||
*/
|
||||
public void updateCumulativeMetrics(final Shard shard) {
|
||||
updateCumulativeMetrics(shard.getReadMetrics());
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the cumulative traversal metrics according to the data in this shard
|
||||
*
|
||||
* @param singleTraverseMetrics read metrics object containing the information about a single shard's worth
|
||||
* of data processing
|
||||
*/
|
||||
public void updateCumulativeMetrics(final ReadMetrics singleTraverseMetrics) {
|
||||
engine.getCumulativeMetrics().incrementMetrics(singleTraverseMetrics);
|
||||
}
|
||||
|
||||
/**
|
||||
* Forward request to printProgress
|
||||
*
|
||||
* @param shard the given shard currently being processed.
|
||||
* Assumes that one cycle has been completed
|
||||
*
|
||||
* @param loc the location
|
||||
*/
|
||||
public void printProgress(Shard shard, GenomeLoc loc) {
|
||||
public void printProgress(final GenomeLoc loc) {
|
||||
// A bypass is inserted here for unit testing.
|
||||
printProgress(loc,shard.getReadMetrics(),false);
|
||||
printProgress(loc, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -202,15 +206,10 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
* every M seconds, for N and M set in global variables.
|
||||
*
|
||||
* @param loc Current location, can be null if you are at the end of the traversal
|
||||
* @param metrics Data processed since the last cumulative
|
||||
* @param mustPrint If true, will print out info, regardless of nRecords or time interval
|
||||
*/
|
||||
private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) {
|
||||
if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 )
|
||||
// don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES
|
||||
return;
|
||||
|
||||
if(!progressMeterInitialized && mustPrint == false ) {
|
||||
private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) {
|
||||
if( ! progressMeterInitialized ) {
|
||||
logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]");
|
||||
logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining",
|
||||
"Location", getTraversalType(), getTraversalType()));
|
||||
|
|
@ -218,40 +217,34 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
}
|
||||
|
||||
final long curTime = timer.currentTime();
|
||||
boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY);
|
||||
boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency);
|
||||
boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY);
|
||||
|
||||
if ( printProgress || printLog ) {
|
||||
// getting and appending metrics data actually turns out to be quite a heavyweight
|
||||
// operation. Postpone it until after determining whether to print the log message.
|
||||
ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? engine.getCumulativeMetrics() : new ReadMetrics();
|
||||
if(metrics != null)
|
||||
cumulativeMetrics.incrementMetrics(metrics);
|
||||
|
||||
final long nRecords = cumulativeMetrics.getNumIterations();
|
||||
|
||||
ProcessingHistory last = updateHistory(loc,cumulativeMetrics);
|
||||
final ProcessingHistory last = updateHistory(loc, engine.getCumulativeMetrics());
|
||||
|
||||
final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds);
|
||||
final AutoFormattingTime bpRate = new AutoFormattingTime(secondsPerMillionBP(last));
|
||||
final AutoFormattingTime unitRate = new AutoFormattingTime(secondsPerMillionElements(last));
|
||||
final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last);
|
||||
final AutoFormattingTime bpRate = new AutoFormattingTime(last.secondsPerMillionBP());
|
||||
final AutoFormattingTime unitRate = new AutoFormattingTime(last.secondsPerMillionElements());
|
||||
final double fractionGenomeTargetCompleted = last.calculateFractionGenomeTargetCompleted(targetSize);
|
||||
final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted);
|
||||
final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds());
|
||||
final long nRecords = engine.getCumulativeMetrics().getNumIterations();
|
||||
|
||||
if ( printProgress ) {
|
||||
lastProgressPrintTime = curTime;
|
||||
|
||||
// dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates
|
||||
if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS )
|
||||
PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds
|
||||
progressPrintFrequency = 60 * 1000; // in milliseconds
|
||||
else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS )
|
||||
PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds
|
||||
progressPrintFrequency = 30 * 1000; // in milliseconds
|
||||
else
|
||||
PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
|
||||
progressPrintFrequency = 10 * 1000; // in milliseconds
|
||||
|
||||
logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s",
|
||||
loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate,
|
||||
final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : String.format("%s:%d", loc.getContig(), loc.getStart());
|
||||
logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s",
|
||||
posName, nRecords*1.0, elapsed, unitRate,
|
||||
100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion));
|
||||
|
||||
}
|
||||
|
|
@ -277,7 +270,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
* @param metrics information about what's been processed already
|
||||
* @return
|
||||
*/
|
||||
private final ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) {
|
||||
private ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) {
|
||||
synchronized (lock) {
|
||||
if ( history.size() > HISTORY_WINDOW_SIZE )
|
||||
history.pop();
|
||||
|
|
@ -290,26 +283,11 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
}
|
||||
}
|
||||
|
||||
/** How long in seconds to process 1M traversal units? */
|
||||
private final double secondsPerMillionElements(ProcessingHistory last) {
|
||||
return (last.elapsedSeconds * 1000000.0) / Math.max(last.unitsProcessed, 1);
|
||||
}
|
||||
|
||||
/** How long in seconds to process 1M bp on the genome? */
|
||||
private final double secondsPerMillionBP(ProcessingHistory last) {
|
||||
return (last.elapsedSeconds * 1000000.0) / Math.max(last.bpProcessed, 1);
|
||||
}
|
||||
|
||||
/** What fractoin of the target intervals have we covered? */
|
||||
private final double calculateFractionGenomeTargetCompleted(ProcessingHistory last) {
|
||||
return (1.0*last.bpProcessed) / targetSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Called after a traversal to print out information about the traversal process
|
||||
*/
|
||||
public void printOnTraversalDone() {
|
||||
printProgress(null, null, true);
|
||||
printProgress(null, true);
|
||||
|
||||
final double elapsed = timer == null ? 0 : timer.getElapsedTime();
|
||||
|
||||
|
|
@ -370,7 +348,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
* @return Frequency, in seconds, of performance log writes.
|
||||
*/
|
||||
public long getPerformanceProgressPrintFrequencySeconds() {
|
||||
return PROGRESS_PRINT_FREQUENCY;
|
||||
return progressPrintFrequency;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -378,6 +356,35 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
* @param seconds number of seconds between messages indicating performance frequency.
|
||||
*/
|
||||
public void setPerformanceProgressPrintFrequencySeconds(long seconds) {
|
||||
PROGRESS_PRINT_FREQUENCY = seconds;
|
||||
progressPrintFrequency = seconds;
|
||||
}
|
||||
|
||||
private static class ProcessingHistory {
|
||||
double elapsedSeconds;
|
||||
long unitsProcessed;
|
||||
long bpProcessed;
|
||||
GenomeLoc loc;
|
||||
|
||||
public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) {
|
||||
this.elapsedSeconds = elapsedSeconds;
|
||||
this.loc = loc;
|
||||
this.unitsProcessed = unitsProcessed;
|
||||
this.bpProcessed = bpProcessed;
|
||||
}
|
||||
|
||||
/** How long in seconds to process 1M traversal units? */
|
||||
private double secondsPerMillionElements() {
|
||||
return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1);
|
||||
}
|
||||
|
||||
/** How long in seconds to process 1M bp on the genome? */
|
||||
private double secondsPerMillionBP() {
|
||||
return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1);
|
||||
}
|
||||
|
||||
/** What fractoin of the target intervals have we covered? */
|
||||
private double calculateFractionGenomeTargetCompleted(final long targetSize) {
|
||||
return (1.0*bpProcessed) / targetSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -104,7 +104,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
prevLoc = location;
|
||||
|
||||
printProgress(dataProvider.getShard(), locus.getLocation());
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
|
|
@ -185,7 +186,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
*/
|
||||
private void writeActiveRegionsToStream( final ActiveRegionWalker<M,T> walker ) {
|
||||
// Just want to output the active regions to a file, not actually process them
|
||||
for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion : workQueue ) {
|
||||
for( final ActiveRegion activeRegion : workQueue ) {
|
||||
if( activeRegion.isActive ) {
|
||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||
}
|
||||
|
|
@ -198,7 +199,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
while( workQueue.peek() != null ) {
|
||||
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
|
||||
if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
|
||||
final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion = workQueue.remove();
|
||||
final ActiveRegion activeRegion = workQueue.remove();
|
||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
|
||||
} else {
|
||||
break;
|
||||
|
|
@ -208,15 +209,15 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
return sum;
|
||||
}
|
||||
|
||||
private T processActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final LinkedHashSet<GATKSAMRecord> reads, final Queue<org.broadinstitute.sting.utils.activeregion.ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
|
||||
private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet<GATKSAMRecord> reads, final Queue<ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
|
||||
final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
|
||||
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
|
||||
// The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
|
||||
long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
|
||||
org.broadinstitute.sting.utils.activeregion.ActiveRegion bestRegion = activeRegion;
|
||||
for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion otherRegionToTest : workQueue ) {
|
||||
ActiveRegion bestRegion = activeRegion;
|
||||
for( final ActiveRegion otherRegionToTest : workQueue ) {
|
||||
if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
|
||||
maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
|
||||
bestRegion = otherRegionToTest;
|
||||
|
|
@ -229,7 +230,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
if( !bestRegion.equals(activeRegion) ) {
|
||||
activeRegion.add( read );
|
||||
}
|
||||
for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion otherRegionToTest : workQueue ) {
|
||||
for( final ActiveRegion otherRegionToTest : workQueue ) {
|
||||
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
|
||||
otherRegionToTest.add( read );
|
||||
}
|
||||
|
|
@ -241,6 +242,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
}
|
||||
}
|
||||
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
|
||||
// WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way.
|
||||
|
||||
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
|
||||
final M x = walker.map( activeRegion, null );
|
||||
|
|
|
|||
|
|
@ -196,7 +196,8 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
|
|||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
|
||||
printProgress(dataProvider.getShard(),site);
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
printProgress(site);
|
||||
done = walker.isDone();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,9 +3,7 @@ package org.broadinstitute.sting.gatk.traversals;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
|
|
@ -15,28 +13,42 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
|||
/**
|
||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
*/
|
||||
public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
|
||||
public abstract class TraverseLociBase<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
@Override
|
||||
protected String getTraversalType() {
|
||||
protected final String getTraversalType() {
|
||||
return "sites";
|
||||
}
|
||||
|
||||
/**
 * Immutable pair returned by a single traversal pass: the number of iterations
 * performed (used by the caller to increment the shard's read metrics) and the
 * final reduce result of the walker.
 */
protected static class TraverseResults<T> {
    // number of loci visited during the traversal
    final int numIterations;
    // the walker's accumulated reduce value at the end of the traversal
    final T reduceResult;

    public TraverseResults(int numIterations, T reduceResult) {
        this.numIterations = numIterations;
        this.reduceResult = reduceResult;
    }
}
|
||||
|
||||
protected abstract TraverseResults<T> traverse( final LocusWalker<M,T> walker,
|
||||
final LocusView locusView,
|
||||
final LocusReferenceView referenceView,
|
||||
final ReferenceOrderedView referenceOrderedDataView,
|
||||
final T sum);
|
||||
|
||||
@Override
|
||||
public T traverse( LocusWalker<M,T> walker,
|
||||
LocusShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider));
|
||||
logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider));
|
||||
|
||||
LocusView locusView = getLocusView( walker, dataProvider );
|
||||
boolean done = false;
|
||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
|
||||
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = null;
|
||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||
|
|
@ -44,43 +56,24 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
|
|||
else
|
||||
referenceOrderedDataView = (RodLocusView)locusView;
|
||||
|
||||
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
while( locusView.hasNext() && ! done ) {
|
||||
AlignmentContext locus = locusView.next();
|
||||
GenomeLoc location = locus.getLocation();
|
||||
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
final boolean keepMeP = walker.filter(tracker, refContext, locus);
|
||||
if (keepMeP) {
|
||||
M x = walker.map(tracker, refContext, locus);
|
||||
sum = walker.reduce(x, sum);
|
||||
done = walker.isDone();
|
||||
}
|
||||
|
||||
printProgress(dataProvider.getShard(),locus.getLocation());
|
||||
}
|
||||
final TraverseResults<T> result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum );
|
||||
sum = result.reduceResult;
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations);
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
}
|
||||
|
||||
// We have a final map call to execute here to clean up the skipped based from the
|
||||
// last position in the ROD to that in the interval
|
||||
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
|
||||
// only do this if the walker isn't done!
|
||||
RodLocusView rodLocusView = (RodLocusView)locusView;
|
||||
long nSkipped = rodLocusView.getLastSkippedBases();
|
||||
final RodLocusView rodLocusView = (RodLocusView)locusView;
|
||||
final long nSkipped = rodLocusView.getLastSkippedBases();
|
||||
if ( nSkipped > 0 ) {
|
||||
GenomeLoc site = rodLocusView.getLocOneBeyondShard();
|
||||
AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
|
||||
M x = walker.map(null, null, ac);
|
||||
final GenomeLoc site = rodLocusView.getLocOneBeyondShard();
|
||||
final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
|
||||
final M x = walker.map(null, null, ac);
|
||||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -90,14 +83,14 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
|
|||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
|
||||
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
|
||||
* of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype
|
||||
* that comes along.
|
||||
* @param walker walker to interrogate.
|
||||
* @param dataProvider Data which which to drive the locus view.
|
||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||
*/
|
||||
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
|
||||
DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
|
||||
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
|
||||
if( dataSource == DataSource.READS )
|
||||
return new CoveredLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
*/
|
||||
public class TraverseLociLinear<M,T> extends TraverseLociBase<M,T> {
|
||||
|
||||
@Override
|
||||
protected TraverseResults<T> traverse(LocusWalker<M, T> walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) {
|
||||
// We keep processing while the next reference location is within the interval
|
||||
boolean done = false;
|
||||
int numIterations = 0;
|
||||
|
||||
while( locusView.hasNext() && ! done ) {
|
||||
numIterations++;
|
||||
final AlignmentContext locus = locusView.next();
|
||||
final GenomeLoc location = locus.getLocation();
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
final boolean keepMeP = walker.filter(tracker, refContext, locus);
|
||||
if (keepMeP) {
|
||||
final M x = walker.map(tracker, refContext, locus);
|
||||
sum = walker.reduce(x, sum);
|
||||
done = walker.isDone();
|
||||
}
|
||||
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
return new TraverseResults<T>(numIterations, sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,205 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
*/
|
||||
public class TraverseLociNano<M,T> extends TraverseLociBase<M,T> {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
private static final boolean DEBUG = false;
|
||||
private static final int BUFFER_SIZE = 1000;
|
||||
|
||||
final NanoScheduler<MapData, MapResult, T> nanoScheduler;
|
||||
|
||||
public TraverseLociNano(int nThreads) {
|
||||
nanoScheduler = new NanoScheduler<MapData, MapResult, T>(BUFFER_SIZE, nThreads);
|
||||
nanoScheduler.setProgressFunction(new TraverseLociProgress());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TraverseResults<T> traverse(final LocusWalker<M, T> walker,
|
||||
final LocusView locusView,
|
||||
final LocusReferenceView referenceView,
|
||||
final ReferenceOrderedView referenceOrderedDataView,
|
||||
final T sum) {
|
||||
nanoScheduler.setDebug(DEBUG);
|
||||
final TraverseLociMap myMap = new TraverseLociMap(walker);
|
||||
final TraverseLociReduce myReduce = new TraverseLociReduce(walker);
|
||||
|
||||
final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView);
|
||||
final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce);
|
||||
|
||||
return new TraverseResults<T>(inputIterator.numIterations, result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create iterator that provides inputs for all map calls into MapData, to be provided
|
||||
* to NanoScheduler for Map/Reduce
|
||||
*/
|
||||
private class MapDataIterator implements Iterator<MapData> {
|
||||
final LocusView locusView;
|
||||
final LocusReferenceView referenceView;
|
||||
final ReferenceOrderedView referenceOrderedDataView;
|
||||
int numIterations = 0;
|
||||
|
||||
private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) {
|
||||
this.locusView = locusView;
|
||||
this.referenceView = referenceView;
|
||||
this.referenceOrderedDataView = referenceOrderedDataView;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return locusView.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MapData next() {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
final GenomeLoc location = locus.getLocation();
|
||||
|
||||
//logger.info("Pulling data from MapDataIterator at " + location);
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext);
|
||||
|
||||
numIterations++;
|
||||
return new MapData(locus, refContext, tracker);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void printOnTraversalDone() {
|
||||
nanoScheduler.shutdown();
|
||||
super.printOnTraversalDone();
|
||||
}
|
||||
|
||||
/**
|
||||
* The input data needed for each map call. The read, the reference, and the RODs
|
||||
*/
|
||||
private class MapData {
|
||||
final AlignmentContext alignmentContext;
|
||||
final ReferenceContext refContext;
|
||||
final RefMetaDataTracker tracker;
|
||||
|
||||
private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) {
|
||||
this.alignmentContext = alignmentContext;
|
||||
this.refContext = refContext;
|
||||
this.tracker = tracker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "MapData " + alignmentContext.getLocation();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the results of a map call, indicating whether the call was good, filtered, or done
|
||||
*/
|
||||
private class MapResult {
|
||||
final M value;
|
||||
final boolean reduceMe;
|
||||
|
||||
/**
|
||||
* Create a MapResult with value that should be reduced
|
||||
*
|
||||
* @param value the value to reduce
|
||||
*/
|
||||
private MapResult(final M value) {
|
||||
this.value = value;
|
||||
this.reduceMe = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a MapResult that shouldn't be reduced
|
||||
*/
|
||||
private MapResult() {
|
||||
this.value = null;
|
||||
this.reduceMe = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A static object that tells reduce that the result of map should be skipped (filtered or done)
|
||||
*/
|
||||
private final MapResult SKIP_REDUCE = new MapResult();
|
||||
|
||||
/**
|
||||
* MapFunction for TraverseReads meeting NanoScheduler interface requirements
|
||||
*
|
||||
* Applies walker.map to MapData, returning a MapResult object containing the result
|
||||
*/
|
||||
private class TraverseLociMap implements NSMapFunction<MapData, MapResult> {
|
||||
final LocusWalker<M,T> walker;
|
||||
|
||||
private TraverseLociMap(LocusWalker<M, T> walker) {
|
||||
this.walker = walker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MapResult apply(final MapData data) {
|
||||
if ( ! walker.isDone() ) {
|
||||
final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext);
|
||||
if (keepMeP) {
|
||||
final M x = walker.map(data.tracker, data.refContext, data.alignmentContext);
|
||||
return new MapResult(x);
|
||||
}
|
||||
}
|
||||
return SKIP_REDUCE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements
|
||||
*
|
||||
* Takes a MapResult object and applies the walkers reduce function to each map result, when applicable
|
||||
*/
|
||||
private class TraverseLociReduce implements NSReduceFunction<MapResult, T> {
|
||||
final LocusWalker<M,T> walker;
|
||||
|
||||
private TraverseLociReduce(LocusWalker<M, T> walker) {
|
||||
this.walker = walker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T apply(MapResult one, T sum) {
|
||||
if ( one.reduceMe )
|
||||
// only run reduce on values that aren't DONE or FAILED
|
||||
return walker.reduce(one.value, sum);
|
||||
else
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
||||
private class TraverseLociProgress implements NSProgressFunction<MapData> {
|
||||
@Override
|
||||
public void progress(MapData lastProcessedMap) {
|
||||
if (lastProcessedMap.alignmentContext != null)
|
||||
printProgress(lastProcessedMap.alignmentContext.getLocation());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -65,7 +65,8 @@ public class TraverseReadPairs<M,T> extends TraversalEngine<M,T, ReadPairWalker<
|
|||
pairs.clear();
|
||||
pairs.add(read);
|
||||
|
||||
printProgress(dataProvider.getShard(),null);
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
printProgress(null);
|
||||
}
|
||||
|
||||
done = walker.isDone();
|
||||
|
|
|
|||
|
|
@ -1,20 +1,3 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
|
|
@ -39,6 +22,19 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
|
|
@ -75,29 +71,27 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
|
|||
if( !dataProvider.hasReads() )
|
||||
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
|
||||
|
||||
ReadView reads = new ReadView(dataProvider);
|
||||
ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
||||
final ReadView reads = new ReadView(dataProvider);
|
||||
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
||||
|
||||
// get the reference ordered data
|
||||
ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
|
||||
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
|
||||
|
||||
boolean done = walker.isDone();
|
||||
// while we still have more reads
|
||||
for (SAMRecord read : reads) {
|
||||
for (final SAMRecord read : reads) {
|
||||
if ( done ) break;
|
||||
// ReferenceContext -- the reference bases covered by the read
|
||||
ReferenceContext refContext = null;
|
||||
|
||||
// get the array of characters for the reference sequence, since we're a mapped read
|
||||
if (!read.getReadUnmappedFlag() && dataProvider.hasReference())
|
||||
refContext = reference.getReferenceContext(read);
|
||||
// ReferenceContext -- the reference bases covered by the read
|
||||
final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference()
|
||||
? reference.getReferenceContext(read)
|
||||
: null;
|
||||
|
||||
// update the number of reads we've seen
|
||||
ReadMetrics readMetrics = dataProvider.getShard().getReadMetrics();
|
||||
readMetrics.incrementNumIterations();
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// if the read is mapped, create a metadata tracker
|
||||
ReadMetaDataTracker tracker = (read.getReferenceIndex() >= 0) ? rodView.getReferenceOrderedDataForRead(read) : null;
|
||||
final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null;
|
||||
|
||||
final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read);
|
||||
if (keepMeP) {
|
||||
|
|
@ -105,8 +99,11 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
|
|||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
|
||||
GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
|
||||
printProgress(dataProvider.getShard(),locus);
|
||||
final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
printProgress(locus);
|
||||
|
||||
done = walker.isDone();
|
||||
}
|
||||
return sum;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,234 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A nano-scheduling version of TraverseReads.
|
||||
*
|
||||
* Implements the traversal of a walker that accepts individual reads, the reference, and
|
||||
* RODs per map call. Directly supports shared memory parallelism via NanoScheduler
|
||||
*
|
||||
* @author depristo
|
||||
* @version 1.0
|
||||
* @date 9/2/2012
|
||||
*/
|
||||
public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,ReadShardDataProvider> {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class);
|
||||
private static final boolean DEBUG = false;
|
||||
final NanoScheduler<MapData, MapResult, T> nanoScheduler;
|
||||
|
||||
public TraverseReadsNano(int nThreads) {
|
||||
final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max
|
||||
nanoScheduler = new NanoScheduler<MapData, MapResult, T>(bufferSize, nThreads);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getTraversalType() {
|
||||
return "reads";
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
* @param walker the walker to traverse with
|
||||
* @param dataProvider the provider of the reads data
|
||||
* @param sum the value of type T, specified by the walker, to feed to the walkers reduce function
|
||||
* @return the reduce variable of the read walker
|
||||
*/
|
||||
public T traverse(ReadWalker<M,T> walker,
|
||||
ReadShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider));
|
||||
|
||||
if( !dataProvider.hasReads() )
|
||||
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
|
||||
|
||||
nanoScheduler.setDebug(DEBUG);
|
||||
final TraverseReadsMap myMap = new TraverseReadsMap(walker);
|
||||
final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker);
|
||||
|
||||
final List<MapData> aggregatedInputs = aggregateMapData(dataProvider);
|
||||
final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce);
|
||||
|
||||
final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read;
|
||||
final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead);
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
printProgress(locus);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate all of the inputs for all map calls into MapData, to be provided
|
||||
* to NanoScheduler for Map/Reduce
|
||||
*
|
||||
* @param dataProvider the source of our data
|
||||
* @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce
|
||||
* should execute
|
||||
*/
|
||||
private List<MapData> aggregateMapData(final ReadShardDataProvider dataProvider) {
|
||||
final ReadView reads = new ReadView(dataProvider);
|
||||
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
||||
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
|
||||
|
||||
final List<MapData> mapData = new LinkedList<MapData>();
|
||||
for ( final SAMRecord read : reads ) {
|
||||
final ReferenceContext refContext = ! read.getReadUnmappedFlag()
|
||||
? reference.getReferenceContext(read)
|
||||
: null;
|
||||
|
||||
// if the read is mapped, create a metadata tracker
|
||||
final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0
|
||||
? rodView.getReferenceOrderedDataForRead(read)
|
||||
: null;
|
||||
|
||||
// update the number of reads we've seen
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker));
|
||||
}
|
||||
|
||||
return mapData;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void printOnTraversalDone() {
|
||||
nanoScheduler.shutdown();
|
||||
super.printOnTraversalDone();
|
||||
}
|
||||
|
||||
/**
|
||||
* The input data needed for each map call. The read, the reference, and the RODs
|
||||
*/
|
||||
private class MapData {
|
||||
final GATKSAMRecord read;
|
||||
final ReferenceContext refContext;
|
||||
final RefMetaDataTracker tracker;
|
||||
|
||||
private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) {
|
||||
this.read = read;
|
||||
this.refContext = refContext;
|
||||
this.tracker = tracker;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the results of a map call, indicating whether the call was good, filtered, or done
|
||||
*/
|
||||
private class MapResult {
|
||||
final M value;
|
||||
final boolean reduceMe;
|
||||
|
||||
/**
|
||||
* Create a MapResult with value that should be reduced
|
||||
*
|
||||
* @param value the value to reduce
|
||||
*/
|
||||
private MapResult(final M value) {
|
||||
this.value = value;
|
||||
this.reduceMe = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a MapResult that shouldn't be reduced
|
||||
*/
|
||||
private MapResult() {
|
||||
this.value = null;
|
||||
this.reduceMe = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A static object that tells reduce that the result of map should be skipped (filtered or done)
|
||||
*/
|
||||
private final MapResult SKIP_REDUCE = new MapResult();
|
||||
|
||||
/**
|
||||
* MapFunction for TraverseReads meeting NanoScheduler interface requirements
|
||||
*
|
||||
* Applies walker.map to MapData, returning a MapResult object containing the result
|
||||
*/
|
||||
private class TraverseReadsMap implements NSMapFunction<MapData, MapResult> {
|
||||
final ReadWalker<M,T> walker;
|
||||
|
||||
private TraverseReadsMap(ReadWalker<M, T> walker) {
|
||||
this.walker = walker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MapResult apply(final MapData data) {
|
||||
if ( ! walker.isDone() ) {
|
||||
final boolean keepMeP = walker.filter(data.refContext, data.read);
|
||||
if (keepMeP)
|
||||
return new MapResult(walker.map(data.refContext, data.read, data.tracker));
|
||||
}
|
||||
|
||||
return SKIP_REDUCE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements
|
||||
*
|
||||
* Takes a MapResult object and applies the walkers reduce function to each map result, when applicable
|
||||
*/
|
||||
private class TraverseReadsReduce implements NSReduceFunction<MapResult, T> {
|
||||
final ReadWalker<M,T> walker;
|
||||
|
||||
private TraverseReadsReduce(ReadWalker<M, T> walker) {
|
||||
this.walker = walker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T apply(MapResult one, T sum) {
|
||||
if ( one.reduceMe )
|
||||
// only run reduce on values that aren't DONE or FAILED
|
||||
return walker.reduce(one.value, sum);
|
||||
else
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
|
|
@ -77,7 +78,7 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
|
|||
public abstract ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
|
||||
|
||||
// Map over the ActiveRegion
|
||||
public abstract MapType map(final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
|
||||
public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
|
||||
|
||||
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
|
||||
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
|
||||
import java.lang.annotation.*;
|
||||
|
||||
/**
|
||||
|
|
@ -25,5 +27,5 @@ import java.lang.annotation.*;
|
|||
@Target(ElementType.TYPE)
|
||||
public @interface BAQMode {
|
||||
public abstract org.broadinstitute.sting.utils.baq.BAQ.QualityMode QualityMode() default org.broadinstitute.sting.utils.baq.BAQ.QualityMode.OVERWRITE_QUALS;
|
||||
public abstract org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime ApplicationTime() default org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime.ON_INPUT;
|
||||
public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT;
|
||||
}
|
||||
|
|
@ -36,7 +36,7 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.clipping.ClippingOp;
|
||||
|
|
@ -299,7 +299,7 @@ public class ClipReads extends ReadWalker<ClipReads.ReadClipperWithData, ClipRea
|
|||
* @param read the read itself, as a GATKSAMRecord
|
||||
* @return the ReadClipper object describing what should be done to clip this read
|
||||
*/
|
||||
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
|
||||
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES )
|
||||
read = ReadClipper.revertSoftClippedBases(read);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
|
||||
import java.lang.annotation.*;
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ package org.broadinstitute.sting.gatk.walkers;
|
|||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -45,12 +45,12 @@ import java.text.NumberFormat;
|
|||
*/
|
||||
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@Requires({DataSource.READS})
|
||||
public class FlagStat extends ReadWalker<Integer, Integer> {
|
||||
public class FlagStat extends ReadWalker<FlagStat.FlagStatus, FlagStat.FlagStatus> implements NanoSchedulable {
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
// what comes out of the flagstat
|
||||
static class FlagStatus {
|
||||
public final static class FlagStatus {
|
||||
long readCount = 0L;
|
||||
long QC_failure = 0L;
|
||||
long duplicates = 0L;
|
||||
|
|
@ -117,62 +117,84 @@ public class FlagStat extends ReadWalker<Integer, Integer> {
|
|||
return builder.toString();
|
||||
}
|
||||
|
||||
}
|
||||
public FlagStatus add(final FlagStatus other) {
|
||||
readCount += other.readCount;
|
||||
QC_failure += other.QC_failure;
|
||||
duplicates += other.duplicates;
|
||||
mapped += other.mapped;
|
||||
paired_in_sequencing += other.paired_in_sequencing;
|
||||
read1 += other.read1;
|
||||
read2 += other.read2;
|
||||
properly_paired += other.properly_paired;
|
||||
with_itself_and_mate_mapped += other.with_itself_and_mate_mapped;
|
||||
singletons += other.singletons;
|
||||
with_mate_mapped_to_a_different_chr += other.with_mate_mapped_to_a_different_chr;
|
||||
with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5 += other.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5;
|
||||
|
||||
|
||||
private FlagStatus myStat = new FlagStatus();
|
||||
|
||||
public Integer map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
|
||||
myStat.readCount++;
|
||||
if (read.getReadFailsVendorQualityCheckFlag()) {
|
||||
myStat.QC_failure++;
|
||||
return this;
|
||||
}
|
||||
if (read.getDuplicateReadFlag()) {
|
||||
myStat.duplicates++;
|
||||
}
|
||||
if (!read.getReadUnmappedFlag()) {
|
||||
myStat.mapped++;
|
||||
}
|
||||
if (read.getReadPairedFlag()) {
|
||||
myStat.paired_in_sequencing++;
|
||||
|
||||
if (read.getSecondOfPairFlag()) {
|
||||
myStat.read2++;
|
||||
} else if (read.getReadPairedFlag()) {
|
||||
myStat.read1++;
|
||||
public FlagStatus add(final GATKSAMRecord read) {
|
||||
this.readCount++;
|
||||
|
||||
if (read.getReadFailsVendorQualityCheckFlag()) {
|
||||
this.QC_failure++;
|
||||
}
|
||||
if (read.getProperPairFlag()) {
|
||||
myStat.properly_paired++;
|
||||
if (read.getDuplicateReadFlag()) {
|
||||
this.duplicates++;
|
||||
}
|
||||
if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) {
|
||||
myStat.with_itself_and_mate_mapped++;
|
||||
if (!read.getReadUnmappedFlag()) {
|
||||
this.mapped++;
|
||||
}
|
||||
if (read.getReadPairedFlag()) {
|
||||
this.paired_in_sequencing++;
|
||||
|
||||
if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) {
|
||||
myStat.with_mate_mapped_to_a_different_chr++;
|
||||
if (read.getSecondOfPairFlag()) {
|
||||
this.read2++;
|
||||
} else if (read.getReadPairedFlag()) {
|
||||
this.read1++;
|
||||
}
|
||||
if (read.getProperPairFlag()) {
|
||||
this.properly_paired++;
|
||||
}
|
||||
if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) {
|
||||
this.with_itself_and_mate_mapped++;
|
||||
|
||||
if (read.getMappingQuality() >= 5) {
|
||||
myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++;
|
||||
if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) {
|
||||
this.with_mate_mapped_to_a_different_chr++;
|
||||
|
||||
if (read.getMappingQuality() >= 5) {
|
||||
this.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) {
|
||||
this.singletons++;
|
||||
}
|
||||
}
|
||||
if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) {
|
||||
myStat.singletons++;
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
|
||||
@Override
|
||||
public FlagStatus map( final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
|
||||
return new FlagStatus().add(read);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FlagStatus reduceInit() {
|
||||
return new FlagStatus();
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
@Override
|
||||
public FlagStatus reduce(final FlagStatus value, final FlagStatus sum) {
|
||||
return sum.add(value);
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
//out.println("[REDUCE RESULT] Traversal result is: " + result);
|
||||
out.println(myStat.toString());
|
||||
@Override
|
||||
public void onTraversalDone(final FlagStatus result) {
|
||||
out.println(result.toString());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
/**
|
||||
* Root parallelism interface. Walkers that implement this
|
||||
* declare that their map function is thread-safe and so multiple
|
||||
* map calls can be run in parallel in the same JVM instance.
|
||||
*/
|
||||
public interface NanoSchedulable {
|
||||
}
|
||||
|
|
@ -45,25 +45,14 @@ import java.util.Collections;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Prints the alignment in the pileup format. In the pileup format, each line represents a genomic position,
|
||||
* consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping
|
||||
* qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all
|
||||
* encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand,
|
||||
* a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the
|
||||
* reverse strand.
|
||||
*
|
||||
* A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next
|
||||
* reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence.
|
||||
* Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference.
|
||||
* Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read
|
||||
* separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality.
|
||||
* A symbol '$' marks the end of a read segment.
|
||||
* Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position,
|
||||
* consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
|
||||
*
|
||||
* Associated command:
|
||||
* samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] <in.alignment>
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
public class Pileup extends LocusWalker<Integer, Integer> implements TreeReducible<Integer> {
|
||||
public class Pileup extends LocusWalker<String, Integer> implements TreeReducible<Integer>, NanoSchedulable {
|
||||
|
||||
private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names
|
||||
|
||||
|
|
@ -81,27 +70,32 @@ public class Pileup extends LocusWalker<Integer, Integer> implements TreeReducib
|
|||
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
|
||||
public List<RodBinding<Feature>> rods = Collections.emptyList();
|
||||
|
||||
public void initialize() {
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
String rods = getReferenceOrderedData( tracker );
|
||||
@Override
|
||||
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
final String rods = getReferenceOrderedData( tracker );
|
||||
|
||||
ReadBackedPileup basePileup = context.getBasePileup();
|
||||
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
|
||||
if ( SHOW_VERBOSE )
|
||||
out.printf(" %s", createVerboseOutput(basePileup));
|
||||
out.println();
|
||||
|
||||
return 1;
|
||||
final StringBuilder s = new StringBuilder();
|
||||
s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods));
|
||||
if ( SHOW_VERBOSE )
|
||||
s.append(" ").append(createVerboseOutput(basePileup));
|
||||
s.append("\n");
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return treeReduce(sum,value);
|
||||
|
||||
@Override
|
||||
public Integer reduce(String value, Integer sum) {
|
||||
out.print(value);
|
||||
return sum + 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,17 +32,16 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
|
||||
|
|
@ -91,9 +90,10 @@ import java.util.TreeSet;
|
|||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||
@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
|
||||
public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable {
|
||||
|
||||
@Output(doc="Write output to this BAM filename instead of STDOUT", required = true)
|
||||
SAMFileWriter out;
|
||||
|
|
@ -138,6 +138,7 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
|
|||
public boolean simplifyReads = false;
|
||||
|
||||
|
||||
List<ReadTransformer> readTransformers = Collections.emptyList();
|
||||
private TreeSet<String> samplesToChoose = new TreeSet<String>();
|
||||
private boolean SAMPLES_SPECIFIED = false;
|
||||
|
||||
|
|
@ -150,6 +151,9 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
|
|||
if ( platform != null )
|
||||
platform = platform.toUpperCase();
|
||||
|
||||
if ( getToolkit() != null )
|
||||
readTransformers = getToolkit().getReadTransformers();
|
||||
|
||||
Collection<String> samplesFromFile;
|
||||
if (!sampleFile.isEmpty()) {
|
||||
samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile);
|
||||
|
|
@ -217,11 +221,19 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
|
|||
* The reads map function.
|
||||
*
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
* @param read the read itself, as a GATKSAMRecord
|
||||
* @param readIn the read itself, as a GATKSAMRecord
|
||||
* @return the read itself
|
||||
*/
|
||||
public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
|
||||
return simplifyReads ? read.simplify() : read;
|
||||
public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
|
||||
GATKSAMRecord workingRead = readIn;
|
||||
|
||||
for ( final ReadTransformer transformer : readTransformers ) {
|
||||
workingRead = transformer.apply(workingRead);
|
||||
}
|
||||
|
||||
if ( simplifyReads ) workingRead = workingRead.simplify();
|
||||
|
||||
return workingRead;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -245,5 +257,4 @@ public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
|
|||
output.addAlignment(read);
|
||||
return output;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
|
|
@ -27,5 +26,5 @@ public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, Re
|
|||
}
|
||||
|
||||
// Map over the org.broadinstitute.sting.gatk.contexts.AlignmentContext
|
||||
public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker);
|
||||
public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -64,7 +64,7 @@ public class SplitSamFile extends ReadWalker<SAMRecord, Map<String, SAMFileWrite
|
|||
logger.info("SplitSamFile version: " + VERSION);
|
||||
}
|
||||
|
||||
public SAMRecord map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
public SAMRecord map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
return read;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -30,12 +30,14 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.filters.MalformedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.samples.Sample;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.recalibration.BQSRMode;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -48,7 +50,8 @@ import java.util.List;
|
|||
*/
|
||||
@ReadFilters(MalformedReadFilter.class)
|
||||
@PartitionBy(PartitionType.NONE)
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT)
|
||||
@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT)
|
||||
@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class})
|
||||
public abstract class Walker<MapType, ReduceType> {
|
||||
final protected static Logger logger = Logger.getLogger(Walker.class);
|
||||
|
|
|
|||
|
|
@ -33,6 +33,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
|
|||
final Genotype g,
|
||||
final GenotypeBuilder gb,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
|
||||
if ( stratifiedContext == null )
|
||||
return;
|
||||
|
||||
Double ratio = annotateSNP(stratifiedContext, vc, g);
|
||||
if (ratio == null)
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
|
||||
* Note that the base quality rank sum test can not be calculated for homozygous sites.
|
||||
* Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
*/
|
||||
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
|
||||
|
|
|
|||
|
|
@ -16,6 +16,10 @@ import java.util.*;
|
|||
* Date: 6/28/12
|
||||
*/
|
||||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele)
|
||||
* Note that the clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
*/
|
||||
public class ClippingRankSumTest extends RankSumTest {
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("ClippingRankSum"); }
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
final Genotype g,
|
||||
final GenotypeBuilder gb,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
|
||||
if ( g == null || !g.isCalled() )
|
||||
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
|
||||
return;
|
||||
|
||||
if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())
|
||||
|
|
@ -97,7 +97,6 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
if (!vc.getAlleles().contains(a))
|
||||
continue; // sanity check - shouldn't be needed
|
||||
alleleCounts.put(a,alleleCounts.get(a)+1);
|
||||
|
||||
}
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
counts[0] = alleleCounts.get(vc.getReference());
|
||||
|
|
|
|||
|
|
@ -32,13 +32,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
|
@ -55,6 +53,8 @@ import java.util.*;
|
|||
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
private static final String FS = "FS";
|
||||
private static final double MIN_PVALUE = 1E-320;
|
||||
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
final ReferenceContext ref,
|
||||
|
|
@ -64,30 +64,53 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
if ( !vc.isVariant() )
|
||||
return null;
|
||||
|
||||
int[][] table;
|
||||
|
||||
if (vc.isSNP() && stratifiedContexts != null) {
|
||||
table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
|
||||
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1);
|
||||
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST);
|
||||
return pValueForBestTable(tableFiltering, tableNoFiltering);
|
||||
}
|
||||
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
|
||||
// either SNP with no alignment context, or indels: per-read likelihood map needed
|
||||
table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
|
||||
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
|
||||
return pValueForBestTable(table, null);
|
||||
}
|
||||
else
|
||||
// for non-snp variants, we need per-read likelihoods.
|
||||
// for snps, we can get same result from simple pileup
|
||||
// for non-snp variants, we need per-read likelihoods.
|
||||
// for snps, we can get same result from simple pileup
|
||||
return null;
|
||||
}
|
||||
|
||||
if (table == null)
|
||||
return null;
|
||||
/**
|
||||
* Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
|
||||
*
|
||||
* @param table1 a contingency table, may be null
|
||||
* @param table2 a contingency table, may be null
|
||||
* @return annotation result for FS given tables
|
||||
*/
|
||||
private Map<String, Object> pValueForBestTable(final int[][] table1, final int[][] table2) {
|
||||
if ( table2 == null )
|
||||
return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1));
|
||||
else if (table1 == null)
|
||||
return annotationForOneTable(pValueForContingencyTable(table2));
|
||||
else { // take the one with the best (i.e., least significant pvalue)
|
||||
double pvalue1 = Math.max(pValueForContingencyTable(table1), MIN_PVALUE);
|
||||
double pvalue2 = Math.max(pValueForContingencyTable(table2), MIN_PVALUE);
|
||||
return annotationForOneTable(Math.max(pvalue1, pvalue2));
|
||||
}
|
||||
}
|
||||
|
||||
Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE);
|
||||
if ( pvalue == null )
|
||||
return null;
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
|
||||
return map;
|
||||
/**
|
||||
* Returns an annotation result given a pValue
|
||||
*
|
||||
* @param pValue
|
||||
* @return a hash map from FS -> phred-scaled pValue
|
||||
*/
|
||||
private Map<String, Object> annotationForOneTable(final double pValue) {
|
||||
final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue));
|
||||
return Collections.singletonMap(FS, value);
|
||||
// Map<String, Object> map = new HashMap<String, Object>();
|
||||
// map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)));
|
||||
// return map;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
|
|
@ -244,7 +267,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
* allele2 # #
|
||||
* @return a 2x2 contingency table
|
||||
*/
|
||||
private static int[][] getSNPContingencyTable(Map<String, AlignmentContext> stratifiedContexts, Allele ref, Allele alt) {
|
||||
private static int[][] getSNPContingencyTable(final Map<String, AlignmentContext> stratifiedContexts,
|
||||
final Allele ref,
|
||||
final Allele alt,
|
||||
final int minQScoreToConsider ) {
|
||||
int[][] table = new int[2][2];
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
|
|
@ -252,8 +278,11 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
|
||||
continue;
|
||||
|
||||
Allele base = Allele.create(p.getBase(), false);
|
||||
boolean isFW = !p.getRead().getReadNegativeStrandFlag();
|
||||
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
|
||||
continue;
|
||||
|
||||
final Allele base = Allele.create(p.getBase(), false);
|
||||
final boolean isFW = !p.getRead().getReadNegativeStrandFlag();
|
||||
|
||||
final boolean matchesRef = ref.equals(base, true);
|
||||
final boolean matchesAlt = alt.equals(base, true);
|
||||
|
|
@ -268,6 +297,4 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
return table;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,14 +3,12 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.samples.Sample;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.samples.Trio;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
|
|
@ -21,21 +19,17 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: 9/14/11
|
||||
* Time: 12:24 PM
|
||||
* Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation
|
||||
* versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is
|
||||
* diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than
|
||||
* the strict 1-∏(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.
|
||||
*/
|
||||
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||
|
||||
private MendelianViolation mendelianViolation = null;
|
||||
public static final String MVLR_KEY = "MVLR";
|
||||
private Set<Trio> trios;
|
||||
private class Trio {
|
||||
String motherId;
|
||||
String fatherId;
|
||||
String childId;
|
||||
}
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -44,7 +38,8 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
|
|||
final VariantContext vc,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
|
||||
if ( mendelianViolation == null ) {
|
||||
if (checkAndSetSamples(((Walker) walker).getSampleDB())) {
|
||||
trios = ((Walker) walker).getSampleDB().getTrios();
|
||||
if ( trios.size() > 0 ) {
|
||||
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
|
||||
}
|
||||
else {
|
||||
|
|
@ -52,15 +47,12 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
|
|||
}
|
||||
}
|
||||
|
||||
Map<String,Object> toRet = new HashMap<String,Object>(1);
|
||||
Map<String,Object> attributeMap = new HashMap<String,Object>(1);
|
||||
//double pNoMV = 1.0;
|
||||
double maxMVLR = Double.MIN_VALUE;
|
||||
for ( Trio trio : trios ) {
|
||||
boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() &&
|
||||
vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() &&
|
||||
vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods();
|
||||
if ( hasAppropriateGenotypes ) {
|
||||
Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId);
|
||||
if ( contextHasTrioLikelihoods(vc,trio) ) {
|
||||
Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.getChildID());
|
||||
maxMVLR = likR > maxMVLR ? likR : maxMVLR;
|
||||
//pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR)));
|
||||
}
|
||||
|
|
@ -68,34 +60,26 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
|
|||
|
||||
//double pSomeMV = 1.0-pNoMV;
|
||||
//toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV));
|
||||
toRet.put("MVLR",maxMVLR);
|
||||
return toRet;
|
||||
if ( Double.compare(maxMVLR,Double.MIN_VALUE) != 0 )
|
||||
attributeMap.put(MVLR_KEY,maxMVLR);
|
||||
return attributeMap;
|
||||
}
|
||||
|
||||
// return the descriptions used for the VCF INFO meta field
|
||||
public List<String> getKeyNames() { return Arrays.asList("MVLR"); }
|
||||
public List<String> getKeyNames() { return Arrays.asList(MVLR_KEY); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
|
||||
|
||||
private boolean checkAndSetSamples(SampleDB db){
|
||||
trios = new HashSet<Trio>();
|
||||
Set<String> families = db.getFamilyIDs();
|
||||
for ( String familyString : families ) {
|
||||
Set<Sample> family = db.getFamily(familyString);
|
||||
Iterator<Sample> sampleIterator = family.iterator();
|
||||
Sample sample;
|
||||
for ( sample = sampleIterator.next(); sampleIterator.hasNext(); sample=sampleIterator.next()) {
|
||||
if ( sample.getParents().size() == 2 ) {
|
||||
Trio trio = new Trio();
|
||||
trio.childId = sample.getID();
|
||||
trio.fatherId = sample.getFather().getID();
|
||||
trio.motherId = sample.getMother().getID();
|
||||
trios.add(trio);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) {
|
||||
for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) {
|
||||
if ( ! context.hasGenotype(sample) )
|
||||
return false;
|
||||
if ( ! context.getGenotype(sample).hasLikelihoods() )
|
||||
return false;
|
||||
}
|
||||
|
||||
return trios.size() > 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
|
||||
* Note that the mapping quality rank sum test can not be calculated for homozygous sites.
|
||||
* Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
*/
|
||||
public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation {
|
|||
final Genotype g,
|
||||
final GenotypeBuilder gb,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
|
||||
if ( g == null || !g.isCalled() )
|
||||
if ( g == null || !g.isCalled() || stratifiedContext == null )
|
||||
return;
|
||||
|
||||
int mq0 = 0;
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
|
||||
* Note that the read position rank sum test can not be calculated for homozygous sites.
|
||||
* Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
*/
|
||||
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -218,7 +217,10 @@ public class VariantAnnotatorEngine {
|
|||
|
||||
// go through all the requested info annotationTypes
|
||||
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
|
||||
Map<String, Object> annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(perReadAlleleLikelihoodMap, vc);
|
||||
if ( !(annotationType instanceof ActiveRegionBasedAnnotation) )
|
||||
continue;
|
||||
|
||||
Map<String, Object> annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc);
|
||||
if ( annotationsFromCurrentType != null ) {
|
||||
infoAnnotations.putAll(annotationsFromCurrentType);
|
||||
}
|
||||
|
|
@ -298,16 +300,12 @@ public class VariantAnnotatorEngine {
|
|||
if (stratifiedPerReadAlleleLikelihoodMap != null)
|
||||
perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName());
|
||||
|
||||
if ( context == null && perReadAlleleLikelihoodMap == null) {
|
||||
// no likelihoods nor pileup available: just move on to next sample
|
||||
genotypes.add(genotype);
|
||||
} else {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
|
||||
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
|
||||
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
genotypes.add(gb.make());
|
||||
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
|
||||
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
|
||||
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
|
||||
return genotypes;
|
||||
|
|
|
|||
|
|
@ -32,10 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -46,6 +45,7 @@ import org.broadinstitute.sting.utils.recalibration.QuantizationInfo;
|
|||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
|
|
@ -56,7 +56,7 @@ import java.lang.reflect.Constructor;
|
|||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
|
||||
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
|
||||
*
|
||||
* <p>
|
||||
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
|
||||
|
|
@ -104,28 +104,28 @@ import java.util.ArrayList;
|
|||
*/
|
||||
|
||||
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
|
||||
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
|
||||
@By(DataSource.READS)
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
|
||||
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
|
||||
public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeReducible<Long> {
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
|
||||
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
|
||||
public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeReducible<Long>, NanoSchedulable {
|
||||
@ArgumentCollection
|
||||
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates
|
||||
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates
|
||||
|
||||
private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization
|
||||
private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization
|
||||
|
||||
private RecalibrationTables recalibrationTables;
|
||||
|
||||
private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental)
|
||||
private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental)
|
||||
|
||||
private RecalibrationEngine recalibrationEngine;
|
||||
|
||||
private int minimumQToUse;
|
||||
|
||||
protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
|
||||
protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
|
||||
protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
||||
protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
|
||||
protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
|
||||
protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
||||
|
||||
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
|
||||
|
||||
|
|
@ -143,16 +143,16 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
if (RAC.FORCE_PLATFORM != null)
|
||||
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
|
||||
|
||||
if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
|
||||
if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
|
||||
throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION);
|
||||
|
||||
if (RAC.LIST_ONLY) {
|
||||
RecalUtils.listAvailableCovariates(logger);
|
||||
System.exit(0);
|
||||
}
|
||||
RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table
|
||||
RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table
|
||||
|
||||
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
|
||||
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
|
||||
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
|
||||
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
|
||||
|
||||
|
|
@ -164,9 +164,9 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
requestedCovariates[covariateIndex++] = covariate;
|
||||
|
||||
logger.info("The covariates being used here: ");
|
||||
for (Covariate cov : requestedCovariates) { // list all the covariates being used
|
||||
for (Covariate cov : requestedCovariates) { // list all the covariates being used
|
||||
logger.info("\t" + cov.getClass().getSimpleName());
|
||||
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
||||
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
||||
}
|
||||
|
||||
int numReadGroups = 0;
|
||||
|
|
@ -216,12 +216,14 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
*/
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
long countedSites = 0L;
|
||||
if (tracker.getValues(RAC.knownSites).size() == 0) { // Only analyze sites not present in the provided known sites
|
||||
// Only analyze sites not present in the provided known sites
|
||||
if (tracker.getValues(RAC.knownSites).size() == 0) {
|
||||
for (final PileupElement p : context.getBasePileup()) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
final int offset = p.getOffset();
|
||||
|
||||
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset)) // This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
|
||||
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
|
||||
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset))
|
||||
continue;
|
||||
|
||||
if (readNotSeen(read)) {
|
||||
|
|
@ -234,10 +236,12 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates));
|
||||
}
|
||||
|
||||
if (!ReadUtils.isSOLiDRead(read) || // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if (!ReadUtils.isSOLiDRead(read) ||
|
||||
RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
RecalUtils.isColorSpaceConsistent(read, offset))
|
||||
recalibrationEngine.updateDataForPileupElement(p, ref.getBase()); // This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
recalibrationEngine.updateDataForPileupElement(p, ref.getBase());
|
||||
}
|
||||
countedSites++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -34,4 +35,5 @@ public interface RecalibrationEngine {
|
|||
|
||||
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase);
|
||||
|
||||
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors);
|
||||
}
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue