Cleanup, document, and unit test ActiveRegion
-- All functions tested. In the testing / review I discovered several bugs in the ActiveRegion routines that manipulate reads. New version should be correct -- Enforce correct ordering of supporting states in constructor -- Enforce read ordering when adding reads to an active region in add -- Fix bug in HaplotypeCaller map with new updating read spans. Now get the full span before clipping down reads in map, so that variants are correctly placed w.r.t. the full reference sequence -- Encapsulate isActive field with an accessor function -- Make sure that all state lists are unmodifiable, and that the docs are clear about this -- ActiveRegion equalsExceptReads is for testing only, so make it package protected -- ActiveRegion.hardClipToRegion must resort reads as they can become out of order -- Previous version of HC clipped reads but, due to clipping, these reads could no longer overlap the active region. The old version of HC kept these reads, while the enforced contracts on the ActiveRegion detected this was a problem and those reads are removed. Has a minor impact on PLs and RankSumTest values -- Updating HaplotypeCaller MD5s to reflect changes to ActiveRegions read inclusion policy
This commit is contained in:
parent
3d9a83c759
commit
92c5635e19
|
|
@ -85,7 +85,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
|
|||
|
||||
@Override
|
||||
public GenomeLoc map(final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker tracker) {
|
||||
if (activeRegion.isActive)
|
||||
if (activeRegion.isActive())
|
||||
return activeRegion.getLocation();
|
||||
else
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -463,15 +463,19 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
allelesToGenotype.removeAll( activeAllelesToGenotype );
|
||||
}
|
||||
|
||||
if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do!
|
||||
if( !activeRegion.isActive()) { return 0; } // Not active so nothing to do!
|
||||
if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do!
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do!
|
||||
|
||||
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails
|
||||
|
||||
// note this operation must be performed before we clip the reads down, as this must correspond to the full reference region
|
||||
final GenomeLoc fullSpanBeforeClipping = getPaddedLoc(activeRegion);
|
||||
|
||||
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING);
|
||||
//int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion ));
|
||||
final ArrayList<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype );
|
||||
final ArrayList<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype );
|
||||
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
|
||||
|
||||
activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region
|
||||
|
|
@ -495,7 +499,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
stratifiedReadMap,
|
||||
perSampleFilteredReadList,
|
||||
fullReferenceWithPadding,
|
||||
getPaddedLoc(activeRegion),
|
||||
fullSpanBeforeClipping,
|
||||
activeRegion.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
activeAllelesToGenotype ) ) {
|
||||
|
|
@ -505,9 +509,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
if ( bamWriter != null ) {
|
||||
// write the haplotypes to the bam
|
||||
final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion);
|
||||
for ( Haplotype haplotype : haplotypes )
|
||||
writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype));
|
||||
writeHaplotype(haplotype, fullSpanBeforeClipping, bestHaplotypes.contains(haplotype));
|
||||
|
||||
// we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently
|
||||
final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<Allele, Haplotype>(haplotypes.size());
|
||||
|
|
@ -519,7 +522,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele != Allele.NO_CALL )
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedRefLoc.getStart());
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), fullSpanBeforeClipping.getStart());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -559,7 +562,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
final ArrayList<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) );
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
|
||||
activeRegion.clearReads();
|
||||
|
||||
// Join overlapping paired reads to create a single longer read
|
||||
|
|
@ -571,17 +574,20 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator());
|
||||
|
||||
// Loop through the reads hard clipping the adaptor and low quality tails
|
||||
final ArrayList<GATKSAMRecord> readsToUse = new ArrayList<GATKSAMRecord>(finalizedReadList.size());
|
||||
for( final GATKSAMRecord myRead : finalizedReadList ) {
|
||||
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
|
||||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
// protect against INTERVALS with abnormally high coverage
|
||||
// BUGBUG: remove when positional downsampler is hooked up to ART/HC
|
||||
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
|
||||
activeRegion.add(clippedRead);
|
||||
// TODO BUGBUG: remove when positional downsampler is hooked up to ART/HC
|
||||
if( activeRegion.readOverlapsRegion(clippedRead) &&
|
||||
clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
|
||||
readsToUse.add(clippedRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse));
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
|
|
@ -596,9 +602,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
}
|
||||
|
||||
private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final int padLeft = Math.max(activeRegion.getReferenceLoc().getStart()-REFERENCE_PADDING, 1);
|
||||
final int padRight = Math.min(activeRegion.getReferenceLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReferenceLoc().getContig()).getSequenceLength());
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReferenceLoc().getContig(), padLeft, padRight);
|
||||
final int padLeft = Math.max(activeRegion.getReadSpanLoc().getStart()-REFERENCE_PADDING, 1);
|
||||
final int padRight = Math.min(activeRegion.getReadSpanLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReadSpanLoc().getContig()).getSequenceLength());
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReadSpanLoc().getContig(), padLeft, padRight);
|
||||
}
|
||||
|
||||
private HashMap<String, ArrayList<GATKSAMRecord>> splitReadsBySample( final List<GATKSAMRecord> reads ) {
|
||||
|
|
|
|||
|
|
@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "11290b619bc79b629cf29b8f428254ce");
|
||||
HCTest(CEUTRIO_BAM, "", "664a14590d0966e63d3aabff2d7bab0a");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "897abb2b4f98e9e460f373f9e0db5033");
|
||||
HCTest(NA12878_BAM, "", "111f3dc086a3cea1be9bd1ad6e1d18ed");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
|
|
@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"efc2cae94069a1d6ee5fdcc7afeaa0ed");
|
||||
"c70f753f7918a1c670ce4ed5c66de09e");
|
||||
}
|
||||
|
||||
private void HCTestComplexGGA(String bam, String args, String md5) {
|
||||
|
|
@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAComplex() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
|
||||
"01f42c311fc3ce4f07ef86f8c01facfb");
|
||||
"b1d3070f0c49becf34101e480ab6c589");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"4c117c84d1abeade1dee3f7b52a4a585");
|
||||
"20eba2e54266f6aebf35b7b7b7e754e3");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "939847eb7bbafc798916acffdb1b5697");
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9805488d85e59e1ae002d0d32d7011d");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleSymbolic() {
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "25806874242973f00fb6f2a320ed4d9c");
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "4544a2916f46f58b32db8008776b91a3");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
|
|
@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c50b06d56cf3d0ef53e73a4973207949");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f3984a91e7562494c2a7e41fd05a6734");
|
||||
}
|
||||
|
||||
// That problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||
|
|
@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ae2470e294d99ff2b825281b84730c72"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3e9e025c539be6c5e0d0f2e5d8e86a62"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("6f18ae64bf466476d780a083dcb5fc43"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("34129e6c6310ef4eeeeb59b0e7ac0464"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
|
|
@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("ecdb8e30ec5dd91efc179ab6732499f9"));
|
||||
Arrays.asList("5f4c07aaf1d2d34cccce43196a5fbd71"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
Arrays.asList("36a90309dde1a325c274388e302ffaa5"));
|
||||
Arrays.asList("6ead001b1f8e7cb433fd335f78fde5f0"));
|
||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -460,7 +460,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(),
|
||||
"end-marker", 0.0);
|
||||
printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(),
|
||||
"size=" + region.getLocation().size(), region.isActive ? 1.0 : -1.0);
|
||||
"size=" + region.getLocation().size(), region.isActive() ? 1.0 : -1.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -541,12 +541,12 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
}
|
||||
|
||||
if ( logger.isDebugEnabled() ) {
|
||||
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
|
||||
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc());
|
||||
}
|
||||
|
||||
if ( LOG_READ_CARRYING )
|
||||
logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s",
|
||||
activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive, activeRegion.size(), maxReadsInMemory));
|
||||
activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory));
|
||||
|
||||
final M x = walker.map(activeRegion, null);
|
||||
return walker.reduce( x, sum );
|
||||
|
|
|
|||
|
|
@ -25,37 +25,105 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.HasGenomeLocation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* Represents a single active region created by the Active Region Traversal for processing
|
||||
*
|
||||
* An active region is a single contiguous span of bases on the genome that should be operated
|
||||
* on as a single unit for the active region traversal. The action may contains a list of
|
||||
* reads that overlap the region (may because there may be no reads in the region). The region
|
||||
* is tagged as being either active or inactive, depending on the probabilities provided by
|
||||
* the isActiveProb results from the ART walker. Each region carries with it the
|
||||
* exact span of the region (bases which are the core of the isActiveProbs from the walker) as
|
||||
* well as an extended size, that includes the ART walker's extension size. Reads in the region
|
||||
* provided by ART include all reads overlapping the extended span, not the raw span.
|
||||
*
|
||||
* User: rpoplin
|
||||
* Date: 1/4/12
|
||||
*/
|
||||
|
||||
@Invariant({
|
||||
"extension >= 0",
|
||||
"activeRegionLoc != null",
|
||||
"genomeLocParser != null",
|
||||
"spanIncludingReads != null",
|
||||
"extendedLoc != null"
|
||||
})
|
||||
public class ActiveRegion implements HasGenomeLocation {
|
||||
|
||||
/**
|
||||
* The reads included in this active region. May be empty upon creation, and expand / contract
|
||||
* as reads are added or removed from this region.
|
||||
*/
|
||||
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
private final List<ActivityProfileState> supportingStates;
|
||||
private final GenomeLoc activeRegionLoc;
|
||||
private final GenomeLoc extendedLoc;
|
||||
private final int extension;
|
||||
private GenomeLoc fullExtentReferenceLoc = null;
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
public final boolean isActive;
|
||||
|
||||
/**
|
||||
* An ordered list (by genomic coordinate) of the ActivityProfileStates that went
|
||||
* into this active region. May be empty, which says that no supporting states were
|
||||
* provided when this region was created.
|
||||
*/
|
||||
private final List<ActivityProfileState> supportingStates;
|
||||
|
||||
/**
|
||||
* The raw span of this active region, not including the active region extension
|
||||
*/
|
||||
private final GenomeLoc activeRegionLoc;
|
||||
|
||||
/**
|
||||
* The span of this active region on the genome, including the active region extension
|
||||
*/
|
||||
private final GenomeLoc extendedLoc;
|
||||
|
||||
/**
|
||||
* The extension, in bp, of this active region.
|
||||
*/
|
||||
private final int extension;
|
||||
|
||||
/**
|
||||
* A genomeLocParser so we can create genomeLocs
|
||||
*/
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
|
||||
/**
|
||||
* Does this region represent an active region (all isActiveProbs above threshold) or
|
||||
* an inactive region (all isActiveProbs below threshold)?
|
||||
*/
|
||||
private final boolean isActive;
|
||||
|
||||
/**
|
||||
* The span of this active region, including the bp covered by all reads in this
|
||||
* region. This union of extensionLoc and the loc of all reads in this region.
|
||||
*
|
||||
* Must be at least as large as extendedLoc, but may be larger when reads
|
||||
* partially overlap this region.
|
||||
*/
|
||||
private GenomeLoc spanIncludingReads;
|
||||
|
||||
/**
|
||||
* Create a new ActiveRegion containing no reads
|
||||
*
|
||||
* @param activeRegionLoc the span of this active region
|
||||
* @param supportingStates the states that went into creating this region, or null / empty if none are available.
|
||||
* If not empty, must have exactly one state for each bp in activeRegionLoc
|
||||
* @param isActive indicates whether this is an active region, or an inactve one
|
||||
* @param genomeLocParser a non-null parser to let us create new genome locs
|
||||
* @param extension the active region extension to use for this active region
|
||||
*/
|
||||
public ActiveRegion( final GenomeLoc activeRegionLoc, final List<ActivityProfileState> supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
|
||||
if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null");
|
||||
if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc);
|
||||
if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
|
||||
if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension);
|
||||
|
||||
|
|
@ -64,75 +132,254 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
this.isActive = isActive;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.extension = extension;
|
||||
extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension);
|
||||
fullExtentReferenceLoc = extendedLoc;
|
||||
this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension);
|
||||
this.spanIncludingReads = extendedLoc;
|
||||
|
||||
if ( ! this.supportingStates.isEmpty() ) {
|
||||
if ( this.supportingStates.size() != activeRegionLoc.size() )
|
||||
throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size());
|
||||
GenomeLoc lastStateLoc = null;
|
||||
for ( final ActivityProfileState state : this.supportingStates ) {
|
||||
if ( lastStateLoc != null ) {
|
||||
if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex())
|
||||
throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state);
|
||||
}
|
||||
lastStateLoc = state.getLoc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive + " nReads=" + reads.size() + " ";
|
||||
return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size() + " ";
|
||||
}
|
||||
|
||||
// add each read to the bin and extend the reference genome activeRegionLoc if needed
|
||||
public void add( final GATKSAMRecord read ) {
|
||||
fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
|
||||
reads.add( read );
|
||||
}
|
||||
|
||||
public void hardClipToActiveRegion() {
|
||||
final ArrayList<GATKSAMRecord> clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() );
|
||||
reads.clear();
|
||||
reads.addAll(clippedReads);
|
||||
}
|
||||
|
||||
public ArrayList<GATKSAMRecord> getReads() { return reads; }
|
||||
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) {
|
||||
/**
|
||||
* See #getActiveRegionReference but with padding == 0
|
||||
*/
|
||||
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) {
|
||||
return getActiveRegionReference(referenceReader, 0);
|
||||
}
|
||||
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference( referenceReader, padding, extendedLoc );
|
||||
/**
|
||||
* Get the reference bases from referenceReader spanned by the extended location of this active region,
|
||||
* including additional padding bp on either side. If this expanded region would exceed the boundaries
|
||||
* of the active region's contig, the returned result will be truncated to only include on-genome reference
|
||||
* bases
|
||||
* @param referenceReader the source of the reference genome bases
|
||||
* @param padding the padding, in BP, we want to add to either side of this active region extended region
|
||||
* @return a non-null array of bytes holding the reference bases in referenceReader
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference(referenceReader, padding, extendedLoc);
|
||||
}
|
||||
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) {
|
||||
/**
|
||||
* See #getActiveRegionReference but using the span including regions not the extended span
|
||||
*/
|
||||
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
|
||||
return getFullReference(referenceReader, 0);
|
||||
}
|
||||
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference( referenceReader, padding, fullExtentReferenceLoc );
|
||||
/**
|
||||
* See #getActiveRegionReference but using the span including regions not the extended span
|
||||
*/
|
||||
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference(referenceReader, padding, spanIncludingReads);
|
||||
}
|
||||
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
|
||||
/**
|
||||
* Get the reference bases from referenceReader spanned by the extended location of this active region,
|
||||
* including additional padding bp on either side. If this expanded region would exceed the boundaries
|
||||
* of the active region's contig, the returned result will be truncated to only include on-genome reference
|
||||
* bases
|
||||
* @param referenceReader the source of the reference genome bases
|
||||
* @param padding the padding, in BP, we want to add to either side of this active region extended region
|
||||
* @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for
|
||||
* @return a non-null array of bytes holding the reference bases in referenceReader
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
|
||||
if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null");
|
||||
if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding);
|
||||
if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null");
|
||||
if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc);
|
||||
|
||||
final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(),
|
||||
Math.max(1, genomeLoc.getStart() - padding),
|
||||
Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
|
||||
|
||||
return reference;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the raw span of this active region (excluding the extension)
|
||||
* @return a non-null genome loc
|
||||
*/
|
||||
@Override
|
||||
@Ensures("result != null")
|
||||
public GenomeLoc getLocation() { return activeRegionLoc; }
|
||||
|
||||
/**
|
||||
* Get the span of this active region including the extension value
|
||||
* @return a non-null GenomeLoc
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public GenomeLoc getExtendedLoc() { return extendedLoc; }
|
||||
public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; }
|
||||
|
||||
public List<ActivityProfileState> getSupportingStates() { return supportingStates; }
|
||||
/**
|
||||
* Get the span of this active region including the extension and the projects on the
|
||||
* genome of all reads in this active region. That is, returns the bp covered by this
|
||||
* region and all reads in the region.
|
||||
* @return a non-null genome loc
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public GenomeLoc getReadSpanLoc() { return spanIncludingReads; }
|
||||
|
||||
/**
|
||||
* Get the active profile states that went into creating this region, if possible
|
||||
* @return an unmodifiable list of states that led to the creation of this region, or an empty
|
||||
* list if none were provided
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public List<ActivityProfileState> getSupportingStates() {
|
||||
return Collections.unmodifiableList(supportingStates);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the active region extension applied to this region
|
||||
*
|
||||
* The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions
|
||||
*
|
||||
* @return the size in bp of the region extension
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int getExtension() { return extension; }
|
||||
public int size() { return reads.size(); }
|
||||
public void clearReads() { reads.clear(); }
|
||||
public void removeAll( final ArrayList<GATKSAMRecord> readsToRemove ) { reads.removeAll( readsToRemove ); }
|
||||
|
||||
public boolean equalExceptReads(final ActiveRegion other) {
|
||||
/**
|
||||
* Get an unmodifiable list of reads currently in this active region.
|
||||
*
|
||||
* The reads are sorted by their coordinate position
|
||||
*
|
||||
* @return an unmodifiable list of reads in this active region
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public List<GATKSAMRecord> getReads() {
|
||||
return Collections.unmodifiableList(reads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of reads currently in this active region
|
||||
* @return an integer >= 0
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int size() { return reads.size(); }
|
||||
|
||||
/**
|
||||
* Add read to this active region
|
||||
*
|
||||
* Read must have alignment start >= than the last read currently in this active region.
|
||||
*
|
||||
* @throws IllegalArgumentException if read doesn't overlap the extended region of this active region
|
||||
*
|
||||
* @param read a non-null GATKSAMRecord
|
||||
*/
|
||||
@Ensures("reads.size() == old(reads.size()) + 1")
|
||||
public void add( final GATKSAMRecord read ) {
|
||||
if ( read == null ) throw new IllegalArgumentException("Read cannot be null");
|
||||
|
||||
final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read );
|
||||
if ( ! readOverlapsRegion(read) )
|
||||
throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc);
|
||||
|
||||
spanIncludingReads = spanIncludingReads.union( readLoc );
|
||||
|
||||
if ( ! reads.isEmpty() ) {
|
||||
final GATKSAMRecord lastRead = reads.get(size() - 1);
|
||||
if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) )
|
||||
throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read);
|
||||
|
||||
if ( read.getAlignmentStart() < lastRead.getAlignmentStart() )
|
||||
throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart());
|
||||
}
|
||||
|
||||
reads.add( read );
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if read would overlap the extended extent of this region
|
||||
* @param read the read we want to test
|
||||
* @return true if read can be added to this region, false otherwise
|
||||
*/
|
||||
public boolean readOverlapsRegion(final GATKSAMRecord read) {
|
||||
final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read );
|
||||
return readLoc.overlapsP(extendedLoc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add all reads to this active region
|
||||
* @param reads a collection of reads to add to this active region
|
||||
*/
|
||||
public void addAll(final Collection<GATKSAMRecord> reads) {
|
||||
if ( reads == null ) throw new IllegalArgumentException("reads cannot be null");
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
add(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear all of the reads currently in this active region
|
||||
*/
|
||||
@Ensures("size() == 0")
|
||||
public void clearReads() {
|
||||
spanIncludingReads = extendedLoc;
|
||||
reads.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all of the reads in readsToRemove from this active region
|
||||
* @param readsToRemove the collection of reads we want to remove
|
||||
*/
|
||||
public void removeAll( final Collection<GATKSAMRecord> readsToRemove ) {
|
||||
reads.removeAll(readsToRemove);
|
||||
spanIncludingReads = extendedLoc;
|
||||
for ( final GATKSAMRecord read : reads ) {
|
||||
spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clips all of the reads in this active region so that none extend beyond the active region extended loc
|
||||
*
|
||||
* This function may change the getReadSpanLoc, as it updates the read span based on the new clipped
|
||||
* read coordinates.
|
||||
*/
|
||||
public void hardClipToActiveRegion() {
|
||||
final ArrayList<GATKSAMRecord> clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() );
|
||||
ReadUtils.sortReadsByCoordinate(clippedReads);
|
||||
clearReads();
|
||||
addAll(clippedReads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this region equal to other, excluding any reads in either region in the comparison
|
||||
* @param other the other active region we want to test
|
||||
* @return true if this region is equal, excluding any reads and derived values, to other
|
||||
*/
|
||||
protected boolean equalExceptReads(final ActiveRegion other) {
|
||||
if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false;
|
||||
if ( isActive != other.isActive ) return false;
|
||||
if ( isActive() != other.isActive()) return false;
|
||||
if ( genomeLocParser != other.genomeLocParser ) return false;
|
||||
if ( extension != other.extension ) return false;
|
||||
if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this region represent an active region (all isActiveProbs above threshold) or
|
||||
* an inactive region (all isActiveProbs below threshold)?
|
||||
*/
|
||||
public boolean isActive() {
|
||||
return isActive;
|
||||
}
|
||||
}
|
||||
|
|
@ -115,6 +115,19 @@ public class ArtificialSAMUtils {
|
|||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an artificial sam header based on the sequence dictionary dict
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) {
|
||||
SAMFileHeader header = new SAMFileHeader();
|
||||
header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate);
|
||||
header.setSequenceDictionary(dict);
|
||||
return header;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* setup a default read group for a SAMFileHeader
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,223 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class ActiveRegionUnitTest extends BaseTest {
|
||||
private GenomeLocParser genomeLocParser;
|
||||
private IndexedFastaSequenceFile seq;
|
||||
private String contig;
|
||||
private int contigLength;
|
||||
|
||||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
contig = "1";
|
||||
contigLength = genomeLocParser.getContigInfo(contig).getSequenceLength();
|
||||
}
|
||||
|
||||
@DataProvider(name = "ActionRegionCreationTest")
|
||||
public Object[][] makePollingData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( final int start : Arrays.asList(1, 10, 100, contigLength - 10, contigLength - 1) ) {
|
||||
for ( final int size : Arrays.asList(1, 10, 100, 1000) ) {
|
||||
for ( final int ext : Arrays.asList(0, 1, 10, 100) ) {
|
||||
for ( final boolean isActive : Arrays.asList(true, false) ) {
|
||||
for ( final boolean addStates : Arrays.asList(true, false) ) {
|
||||
List<ActivityProfileState> states = null;
|
||||
if ( addStates ) {
|
||||
states = new LinkedList<ActivityProfileState>();
|
||||
for ( int i = start; i < start + size; i++ ) {
|
||||
states.add(new ActivityProfileState(genomeLocParser.createGenomeLoc(contig, i + start), isActive ? 1.0 : 0.0));
|
||||
}
|
||||
}
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + size - 1);
|
||||
tests.add(new Object[]{loc, states, isActive, ext});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "ActionRegionCreationTest")
|
||||
public void testCreatingActiveRegions(final GenomeLoc loc, final List<ActivityProfileState> supportingStates, final boolean isActive, final int extension) {
|
||||
final ActiveRegion region = new ActiveRegion(loc, supportingStates, isActive, genomeLocParser, extension);
|
||||
Assert.assertEquals(region.getLocation(), loc);
|
||||
Assert.assertEquals(region.getExtendedLoc().getStart(), Math.max(loc.getStart() - extension, 1));
|
||||
Assert.assertEquals(region.getExtendedLoc().getStop(), Math.min(loc.getStop() + extension, contigLength));
|
||||
Assert.assertEquals(region.getReadSpanLoc().getStart(), Math.max(loc.getStart() - extension, 1));
|
||||
Assert.assertEquals(region.getReadSpanLoc().getStop(), Math.min(loc.getStop() + extension, contigLength));
|
||||
Assert.assertEquals(region.isActive(), isActive);
|
||||
Assert.assertEquals(region.getExtension(), extension);
|
||||
Assert.assertEquals(region.getReads(), Collections.emptyList());
|
||||
Assert.assertEquals(region.size(), 0);
|
||||
Assert.assertEquals(region.getSupportingStates(), supportingStates == null ? Collections.emptyList() : supportingStates);
|
||||
Assert.assertNotNull(region.toString());
|
||||
|
||||
assertGoodReferenceGetter(region.getActiveRegionReference(seq), region.getExtendedLoc(), 0);
|
||||
assertGoodReferenceGetter(region.getActiveRegionReference(seq, 10), region.getExtendedLoc(), 10);
|
||||
assertGoodReferenceGetter(region.getFullReference(seq), region.getReadSpanLoc(), 0);
|
||||
assertGoodReferenceGetter(region.getFullReference(seq, 10), region.getReadSpanLoc(), 10);
|
||||
}
|
||||
|
||||
private void assertGoodReferenceGetter(final byte[] actualBytes, final GenomeLoc span, final int padding) {
|
||||
final int expectedStart = Math.max(span.getStart() - padding, 1);
|
||||
final int expectedStop = Math.min(span.getStop() + padding, contigLength);
|
||||
final byte[] expectedBytes = seq.getSubsequenceAt(span.getContig(), expectedStart, expectedStop).getBases();
|
||||
Assert.assertEquals(actualBytes, expectedBytes);
|
||||
}
|
||||
|
||||
@DataProvider(name = "ActiveRegionReads")
|
||||
public Object[][] makeActiveRegionReads() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
|
||||
for ( final int start : Arrays.asList(1, 10, 100, contigLength - 10, contigLength - 1) ) {
|
||||
for ( final int readStartOffset : Arrays.asList(-100, -10, 0, 10, 100) ) {
|
||||
for ( final int readSize : Arrays.asList(10, 100, 1000) ) {
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLocOnContig(contig, start, start + 10);
|
||||
|
||||
final int readStart = Math.max(start + readStartOffset, 1);
|
||||
final int readStop = Math.min(readStart + readSize, contigLength);
|
||||
final int readLength = readStop - readStart + 1;
|
||||
if ( readLength > 0 ) {
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, readStart, readLength);
|
||||
final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read);
|
||||
if ( readLoc.overlapsP(loc) )
|
||||
tests.add(new Object[]{loc, read});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "ActiveRegionReads")
|
||||
public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) {
|
||||
final GenomeLoc expectedSpan = loc.union(genomeLocParser.createGenomeLoc(read));
|
||||
|
||||
final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0);
|
||||
final ActiveRegion region2 = new ActiveRegion(loc, null, true, genomeLocParser, 0);
|
||||
Assert.assertEquals(region.getReads(), Collections.emptyList());
|
||||
Assert.assertEquals(region.size(), 0);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), loc);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.add(read);
|
||||
Assert.assertEquals(region.getReads(), Collections.singletonList(read));
|
||||
Assert.assertEquals(region.size(), 1);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), expectedSpan);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.clearReads();
|
||||
Assert.assertEquals(region.getReads(), Collections.emptyList());
|
||||
Assert.assertEquals(region.size(), 0);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), loc);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.addAll(Collections.singleton(read));
|
||||
Assert.assertEquals(region.getReads(), Collections.singletonList(read));
|
||||
Assert.assertEquals(region.size(), 1);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), expectedSpan);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.removeAll(Collections.<GATKSAMRecord>emptyList());
|
||||
Assert.assertEquals(region.getReads(), Collections.singletonList(read));
|
||||
Assert.assertEquals(region.size(), 1);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), expectedSpan);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.removeAll(Collections.singletonList(read));
|
||||
Assert.assertEquals(region.getReads(), Collections.emptyList());
|
||||
Assert.assertEquals(region.size(), 0);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), loc);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.add(read);
|
||||
region.hardClipToActiveRegion();
|
||||
Assert.assertEquals(region.size(), 1);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), loc);
|
||||
Assert.assertTrue(region.getReads().get(0).getAlignmentStart() >= region.getExtendedLoc().getStart());
|
||||
Assert.assertTrue(region.getReads().get(0).getAlignmentEnd() <= region.getExtendedLoc().getStop());
|
||||
}
|
||||
|
||||
@DataProvider(name = "BadReadsTest")
|
||||
public Object[][] makeBadReadsTest() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
|
||||
tests.add(new Object[]{
|
||||
ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10),
|
||||
ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 9, 10)});
|
||||
tests.add(new Object[]{
|
||||
ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10),
|
||||
ArtificialSAMUtils.createArtificialRead(header, "read2", 1, 9, 10)});
|
||||
tests.add(new Object[]{
|
||||
ArtificialSAMUtils.createArtificialRead(header, "read1", 1, 10, 10),
|
||||
ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 9, 10)});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BadReadsTest", expectedExceptions = IllegalArgumentException.class)
|
||||
public void testBadReads(final GATKSAMRecord read1, final GATKSAMRecord read2) {
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(read1);
|
||||
final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0);
|
||||
region.add(read1);
|
||||
region.add(read2);
|
||||
}
|
||||
}
|
||||
|
|
@ -276,7 +276,7 @@ public class ActivityProfileUnitTest extends BaseTest {
|
|||
Assert.assertTrue(regionOffset >= 0 && regionOffset < probs.size(), "Region " + region + " has a bad offset w.r.t. start");
|
||||
for ( int j = 0; j < region.getLocation().size(); j++ ) {
|
||||
final int siteOffset = j + regionOffset;
|
||||
Assert.assertEquals(region.isActive, probs.get(siteOffset).booleanValue());
|
||||
Assert.assertEquals(region.isActive(), probs.get(siteOffset).booleanValue());
|
||||
Assert.assertFalse(seenSites.get(siteOffset), "Site " + j + " in " + region + " was seen already");
|
||||
seenSites.set(siteOffset, true);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -309,8 +309,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
|
|||
lastPosSeen = region.getLocation().getStop();
|
||||
|
||||
for ( final ActivityProfileState state : region.getSupportingStates() ) {
|
||||
Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive,
|
||||
"Region is active=" + region.isActive + " but contains a state " + state + " with prob "
|
||||
Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive(),
|
||||
"Region is active=" + region.isActive() + " but contains a state " + state + " with prob "
|
||||
+ state.isActiveProb + " not within expected values given threshold for activity of "
|
||||
+ ActivityProfile.ACTIVE_PROB_THRESHOLD);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue