Merge pull request #128 from broadinstitute/eb_rr_polyploid_compression_GSA-639
This commit is contained in:
commit
a2b69790a6
|
|
@ -314,10 +314,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
}
|
||||
|
||||
private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) {
|
||||
// ignore reduced reads because they are always on the forward strand!
|
||||
// TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test
|
||||
if ( read.isReducedRead() )
|
||||
return;
|
||||
|
||||
final boolean matchesRef = allele.equals(ref, true);
|
||||
final boolean matchesAlt = allele.equals(alt, true);
|
||||
|
|
@ -326,12 +322,17 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
final int row = matchesRef ? 0 : 1;
|
||||
|
||||
if ( read.isStrandless() ) {
|
||||
// a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
|
||||
// (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
|
||||
// if the read is only seen once, because it's a merged read or other)
|
||||
final int toAdd = Math.max(representativeCount / 2, 1);
|
||||
table[row][0] += toAdd;
|
||||
table[row][1] += toAdd;
|
||||
|
||||
// ignore strandless reduced reads because they are always on the forward strand!
|
||||
if ( !read.isReducedRead() ) {
|
||||
|
||||
// a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
|
||||
// (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
|
||||
// if the read is only seen once, because it's a merged read or other)
|
||||
final int toAdd = Math.max(representativeCount / 2, 1);
|
||||
table[row][0] += toAdd;
|
||||
table[row][1] += toAdd;
|
||||
}
|
||||
} else {
|
||||
// a normal read with an actual strand
|
||||
final boolean isFW = !read.getReadNegativeStrandFlag();
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -88,10 +89,12 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn
|
|||
int deletions = 0;
|
||||
int depth = 0;
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
AlignmentContext context = sample.getValue();
|
||||
final ReadBackedPileup pileup = context.getBasePileup();
|
||||
deletions += pileup.getNumberOfDeletions();
|
||||
depth += pileup.getNumberOfElements();
|
||||
for ( final PileupElement p : sample.getValue().getBasePileup() ) {
|
||||
final int actualSampleDepth = p.getRepresentativeCount();
|
||||
depth += actualSampleDepth;
|
||||
if ( p.isDeletion() )
|
||||
deletions += actualSampleDepth;
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));
|
||||
|
|
|
|||
|
|
@ -50,6 +50,10 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
|
|||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The element that describes the header of the sliding window.
|
||||
|
|
@ -264,29 +268,56 @@ public class HeaderElement {
|
|||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of haplotypes necessary to represent this site.
|
||||
* Calculates the number of alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return the number of alleles necessary to represent this site.
|
||||
* @param allowDeletions should we allow deletions?
|
||||
* @return the number of alleles necessary to represent this site or -1 if allowDeletions is false and there are a sufficient number of them
|
||||
*/
|
||||
public int getNumberOfAlleles(final double minVariantProportion) {
|
||||
public int getNumberOfAlleles(final double minVariantProportion, final boolean allowDeletions) {
|
||||
final List<BaseIndex> alleles = getAlleles(minVariantProportion, allowDeletions);
|
||||
return alleles == null ? -1 : alleles.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @param allowDeletions should we allow deletions?
|
||||
* @return the list of alleles necessary to represent this site or null if allowDeletions is false and there are a sufficient number of them
|
||||
*/
|
||||
public List<BaseIndex> getAlleles(final double minVariantProportion, final boolean allowDeletions) {
|
||||
final int totalBaseCount = consensusBaseCounts.totalCount();
|
||||
if (totalBaseCount == 0)
|
||||
return 0;
|
||||
if ( totalBaseCount == 0 )
|
||||
return Collections.emptyList();
|
||||
|
||||
final int minBaseCountForRelevantAlleles = (int)(minVariantProportion * totalBaseCount);
|
||||
final int minBaseCountForRelevantAlleles = Math.max(1, (int)(minVariantProportion * totalBaseCount));
|
||||
|
||||
int nAlleles = 0;
|
||||
for ( BaseIndex base : BaseIndex.values() ) {
|
||||
final List<BaseIndex> alleles = new ArrayList<BaseIndex>(4);
|
||||
for ( final BaseIndex base : BaseIndex.values() ) {
|
||||
final int baseCount = consensusBaseCounts.countOfBase(base);
|
||||
|
||||
// don't consider this allele if the count is 0
|
||||
if ( baseCount == 0 )
|
||||
continue;
|
||||
|
||||
if ( baseCount >= minBaseCountForRelevantAlleles )
|
||||
nAlleles++;
|
||||
if ( baseCount >= minBaseCountForRelevantAlleles ) {
|
||||
if ( !allowDeletions && base == BaseIndex.D )
|
||||
return null;
|
||||
alleles.add(base);
|
||||
}
|
||||
}
|
||||
return nAlleles;
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether there are a significant number of softclips.
|
||||
*
|
||||
* @param minVariantProportion the minimum proportion to consider something significant.
|
||||
* @return true if there are significant softclips, false otherwise
|
||||
*/
|
||||
public boolean hasSignificantSoftclips(final double minVariantProportion) {
|
||||
final int totalBaseCount = consensusBaseCounts.totalCount();
|
||||
if ( totalBaseCount == 0 )
|
||||
return false;
|
||||
|
||||
final int minBaseCountForSignificance = Math.max(1, (int)(minVariantProportion * totalBaseCount));
|
||||
return nSoftClippedBases >= minBaseCountForSignificance;
|
||||
}
|
||||
}
|
||||
|
|
@ -46,9 +46,11 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -97,43 +99,62 @@ public class MultiSampleCompressor {
|
|||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final boolean allowPolyploidReduction) {
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, allowPolyploidReduction));
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
}
|
||||
|
||||
public ObjectSet<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
||||
/**
|
||||
* Add an alignment to the compressor
|
||||
*
|
||||
* @param read the read to be added
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public ObjectSet<GATKSAMRecord> addAlignment(final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
String sampleName = read.getReadGroup().getSample();
|
||||
SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
|
||||
if ( compressor == null )
|
||||
throw new ReviewedStingException("No compressor for sample " + sampleName);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions);
|
||||
ObjectSet<GATKSAMRecord> reads = readsAndStash.getFirst();
|
||||
CompressionStash regions = readsAndStash.getSecond();
|
||||
|
||||
reads.addAll(closeVariantRegionsInAllSamples(regions));
|
||||
reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions));
|
||||
|
||||
return reads;
|
||||
}
|
||||
|
||||
public ObjectSet<GATKSAMRecord> close() {
|
||||
/**
|
||||
* Properly closes the compressor.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public ObjectSet<GATKSAMRecord> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close();
|
||||
reads = readsAndStash.getFirst();
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close(knownSnpPositions);
|
||||
reads.addAll(readsAndStash.getFirst());
|
||||
}
|
||||
return reads;
|
||||
}
|
||||
|
||||
private ObjectSet<GATKSAMRecord> closeVariantRegionsInAllSamples(CompressionStash regions) {
|
||||
/**
|
||||
* Finalizes current variant regions.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
private ObjectSet<GATKSAMRecord> closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
if (!regions.isEmpty()) {
|
||||
for (SingleSampleCompressor sample : compressorsPerSample.values()) {
|
||||
reads.addAll(sample.closeVariantRegions(regions));
|
||||
reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions));
|
||||
}
|
||||
}
|
||||
return reads;
|
||||
|
|
|
|||
|
|
@ -54,9 +54,7 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -75,6 +73,10 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
|
|||
import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -147,10 +149,12 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
private byte minTailQuality = 2;
|
||||
|
||||
/**
|
||||
* Allow the experimental polyploid-based reduction capabilities of this tool
|
||||
* Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction.
|
||||
* Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored.
|
||||
* Note that polyploid ("het") compression will work only when a single SNP is present in a consensus window.
|
||||
*/
|
||||
@Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false)
|
||||
private boolean USE_POLYPLOID_REDUCTION = false;
|
||||
@Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
|
||||
|
|
@ -249,6 +253,8 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
|
||||
ObjectSortedSet<GenomeLoc> intervalList;
|
||||
|
||||
final ObjectSortedSet<GenomeLoc> knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
// IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER
|
||||
public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam";
|
||||
|
|
@ -359,8 +365,22 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
for (GATKSAMRecord mappedRead : mappedReads)
|
||||
System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
|
||||
|
||||
return mappedReads;
|
||||
// add the SNPs to the list of known positions
|
||||
populateKnownSNPs(metaDataTracker);
|
||||
|
||||
return mappedReads;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the positions of known SNPs to the set so that we can keep track of it
|
||||
*
|
||||
* @param metaDataTracker the ref meta data tracker
|
||||
*/
|
||||
protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) {
|
||||
for ( final VariantContext vc : metaDataTracker.getValues(known) ) {
|
||||
if ( vc.isSNP() )
|
||||
knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -373,7 +393,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, USE_POLYPLOID_REDUCTION));
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -405,7 +425,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
if (debugLevel == 1)
|
||||
System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
|
||||
|
||||
for (GATKSAMRecord compressedRead : stash.compress(readReady))
|
||||
for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions))
|
||||
outputRead(compressedRead);
|
||||
|
||||
// We only care about maintaining the link between read pairs if they are in the same variant
|
||||
|
|
@ -422,6 +442,10 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
firstRead = false;
|
||||
}
|
||||
|
||||
// reduce memory requirements by removing old positions
|
||||
if ( !mappedReads.isEmpty() )
|
||||
clearStaleKnownPositions(mappedReads.get(0));
|
||||
|
||||
return stash;
|
||||
}
|
||||
|
||||
|
|
@ -434,13 +458,38 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
public void onTraversalDone(ReduceReadsStash stash) {
|
||||
|
||||
// output any remaining reads in the compressor
|
||||
for (GATKSAMRecord read : stash.close())
|
||||
for (GATKSAMRecord read : stash.close(knownSnpPositions))
|
||||
outputRead(read);
|
||||
|
||||
if (nwayout)
|
||||
writerToUse.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes known positions that are no longer relevant for use with het compression.
|
||||
*
|
||||
* @param read the current read, used for checking whether there are stale positions we can remove
|
||||
*/
|
||||
protected void clearStaleKnownPositions(final GATKSAMRecord read) {
|
||||
// nothing to clear if empty
|
||||
if ( knownSnpPositions.isEmpty() )
|
||||
return;
|
||||
|
||||
// not ready to be cleared until we encounter a read from a different contig
|
||||
final int contigIndexOfRead = read.getReferenceIndex();
|
||||
if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead )
|
||||
return;
|
||||
|
||||
// because we expect most elements to be stale, it's not going to be efficient to remove them one at a time
|
||||
final ObjectAVLTreeSet<GenomeLoc> goodLocs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
for ( final GenomeLoc loc : knownSnpPositions ) {
|
||||
if ( loc.getContigIndex() == contigIndexOfRead )
|
||||
goodLocs.add(loc);
|
||||
}
|
||||
knownSnpPositions.clear();
|
||||
knownSnpPositions.addAll(goodLocs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hard clips away all parts of the read that doesn't agree with the intervals selected.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -46,6 +46,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -106,11 +108,12 @@ public class ReduceReadsStash {
|
|||
/**
|
||||
* sends the read to the MultiSampleCompressor
|
||||
*
|
||||
* @param read the read to be compressed
|
||||
* @param read the read to be compressed
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> compress(GATKSAMRecord read) {
|
||||
return compressor.addAlignment(read);
|
||||
public Iterable<GATKSAMRecord> compress(final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return compressor.addAlignment(read, knownSnpPositions);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -125,18 +128,19 @@ public class ReduceReadsStash {
|
|||
/**
|
||||
* Close the stash, processing all remaining reads in order
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return a list of all the reads produced by the SlidingWindow machinery)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
public Iterable<GATKSAMRecord> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
LinkedList<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// compress all the stashed reads (in order)
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
for (GATKSAMRecord compressedRead : compressor.addAlignment(read))
|
||||
for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions))
|
||||
result.add(compressedRead);
|
||||
|
||||
// output any remaining reads from the compressor
|
||||
for (GATKSAMRecord read : compressor.close())
|
||||
for (GATKSAMRecord read : compressor.close(knownSnpPositions))
|
||||
result.add(read);
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -46,7 +46,9 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -64,7 +66,6 @@ public class SingleSampleCompressor {
|
|||
final private double minIndelProportionToTriggerVariant;
|
||||
final private int minBaseQual;
|
||||
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
final private boolean allowPolyploidReduction;
|
||||
|
||||
private SlidingWindow slidingWindow;
|
||||
private int slidingWindowCounter;
|
||||
|
|
@ -77,8 +78,7 @@ public class SingleSampleCompressor {
|
|||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final boolean allowPolyploidReduction) {
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
this.minMappingQuality = minMappingQuality;
|
||||
|
|
@ -87,10 +87,16 @@ public class SingleSampleCompressor {
|
|||
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
|
||||
this.minBaseQual = minBaseQual;
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> addAlignment( GATKSAMRecord read ) {
|
||||
/**
|
||||
* Add an alignment to the compressor
|
||||
*
|
||||
* @param read the read to be added
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions ) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
CompressionStash stash = new CompressionStash();
|
||||
int readOriginalStart = read.getUnclippedStart();
|
||||
|
|
@ -101,14 +107,14 @@ public class SingleSampleCompressor {
|
|||
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
|
||||
|
||||
// close the current sliding window
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close();
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions);
|
||||
reads = readsAndStash.getFirst();
|
||||
stash = readsAndStash.getSecond();
|
||||
slidingWindow = null; // so we create a new one on the next if
|
||||
}
|
||||
|
||||
if ( slidingWindow == null) { // this is the first read
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), allowPolyploidReduction);
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
|
||||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
|
|
@ -116,12 +122,26 @@ public class SingleSampleCompressor {
|
|||
return new Pair<ObjectSet<GATKSAMRecord>, CompressionStash>(reads, stash);
|
||||
}
|
||||
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> close() {
|
||||
return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
|
||||
/**
|
||||
* Properly closes the compressor.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return (slidingWindow != null) ? slidingWindow.close(knownSnpPositions) : emptyPair;
|
||||
}
|
||||
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
|
||||
return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions);
|
||||
/**
|
||||
* Finalizes current variant regions.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
|
|
@ -65,10 +66,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.ListIterator;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -109,8 +107,6 @@ public class SlidingWindow {
|
|||
protected ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
private boolean hasIndelQualities;
|
||||
|
||||
private boolean allowPolyploidReductionInGeneral;
|
||||
|
||||
private static CompressionStash emptyRegions = new CompressionStash();
|
||||
|
||||
/**
|
||||
|
|
@ -154,7 +150,7 @@ public class SlidingWindow {
|
|||
this.readsInWindow = new ObjectAVLTreeSet<GATKSAMRecord>();
|
||||
}
|
||||
|
||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, boolean allowPolyploidReduction) {
|
||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
|
||||
|
|
@ -188,8 +184,6 @@ public class SlidingWindow {
|
|||
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
|
||||
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -403,12 +397,12 @@ public class SlidingWindow {
|
|||
* @param header the window header
|
||||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
* @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
|
||||
* @param strandType the strandedness that the synthetic read should be represented as having
|
||||
* @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated.
|
||||
*/
|
||||
@Requires({"start >= 0 && (end >= start || end == 0)"})
|
||||
@Ensures("result != null")
|
||||
protected ObjectArrayList<GATKSAMRecord> addToSyntheticReads(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
protected ObjectArrayList<GATKSAMRecord> addToSyntheticReads(final LinkedList<HeaderElement> header, final int start, final int end, final SyntheticRead.StrandType strandType) {
|
||||
ObjectArrayList<GATKSAMRecord> reads = new ObjectArrayList<GATKSAMRecord>();
|
||||
if (start < end) {
|
||||
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
|
|
@ -422,22 +416,22 @@ public class SlidingWindow {
|
|||
reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
|
||||
|
||||
int endOfConsensus = findNextNonConsensusElement(header, start, end);
|
||||
addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand);
|
||||
addToRunningConsensus(header, start, endOfConsensus, strandType);
|
||||
|
||||
if (endOfConsensus <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
|
||||
|
||||
reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand));
|
||||
reads.addAll(addToSyntheticReads(header, endOfConsensus, end, strandType));
|
||||
} else if (headerElement.hasFilteredData()) {
|
||||
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
|
||||
|
||||
int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
|
||||
reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
|
||||
reads.addAll(addToFilteredData(header, start, endOfFilteredData, strandType));
|
||||
|
||||
if (endOfFilteredData <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
|
||||
|
||||
reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand));
|
||||
reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, strandType));
|
||||
} else if (headerElement.isEmpty()) {
|
||||
reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
||||
|
||||
|
|
@ -446,7 +440,7 @@ public class SlidingWindow {
|
|||
if (endOfEmptyData <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
|
||||
|
||||
reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand));
|
||||
reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, strandType));
|
||||
} else
|
||||
throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
|
||||
|
||||
|
|
@ -558,16 +552,16 @@ public class SlidingWindow {
|
|||
* @param header the window header
|
||||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
* @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
|
||||
* @param strandType the strandedness that the synthetic read should be represented as having
|
||||
* @return a non-null list of GATKSAMRecords representing finalized filtered consensus data. Empty list if no consensus was generated.
|
||||
*/
|
||||
@Requires({"start >= 0 && (end >= start || end == 0)"})
|
||||
@Ensures("result != null")
|
||||
private ObjectArrayList<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
private ObjectArrayList<GATKSAMRecord> addToFilteredData(final LinkedList<HeaderElement> header, final int start, final int end, final SyntheticRead.StrandType strandType) {
|
||||
ObjectArrayList<GATKSAMRecord> result = new ObjectArrayList<GATKSAMRecord>();
|
||||
|
||||
if (filteredDataConsensus == null)
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand);
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType);
|
||||
|
||||
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
for (int index = start; index < end; index++) {
|
||||
|
|
@ -583,7 +577,7 @@ public class SlidingWindow {
|
|||
|
||||
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
|
||||
result.add(finalizeFilteredDataConsensus());
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, isNegativeStrand);
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, strandType);
|
||||
}
|
||||
|
||||
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
|
||||
|
|
@ -601,12 +595,12 @@ public class SlidingWindow {
|
|||
* @param header the window header
|
||||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
* @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
|
||||
* @param strandType the strandedness that the synthetic read should be represented as having
|
||||
*/
|
||||
@Requires({"start >= 0 && (end >= start || end == 0)"})
|
||||
private void addToRunningConsensus(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
private void addToRunningConsensus(final LinkedList<HeaderElement> header, final int start, final int end, final SyntheticRead.StrandType strandType) {
|
||||
if (runningConsensus == null)
|
||||
runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand);
|
||||
runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType);
|
||||
|
||||
Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
for (int index = start; index < end; index++) {
|
||||
|
|
@ -642,29 +636,39 @@ public class SlidingWindow {
|
|||
*
|
||||
* @param start the first window header index in the variant region (inclusive)
|
||||
* @param stop the last window header index of the variant region (inclusive)
|
||||
* @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here?
|
||||
* @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here
|
||||
* @return a non-null list of all reads contained in the variant region
|
||||
*/
|
||||
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
|
||||
@Ensures("result != null")
|
||||
protected ObjectList<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
protected ObjectList<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectList<GATKSAMRecord> allReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
|
||||
// Try to compress into a polyploid consensus
|
||||
int hetRefPosition = -1;
|
||||
final Object[] header = windowHeader.toArray();
|
||||
// Optimization: don't bother if there are no known SNPs
|
||||
final int hetRefPosition = knownSnpPositions.isEmpty() ? -1 : findSinglePolyploidCompressiblePosition(start, stop);
|
||||
|
||||
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition )
|
||||
hetRefPosition = findSinglePolyploidCompressiblePosition(header, start, stop);
|
||||
boolean successfullyCreatedPolyploidConsensus = false;
|
||||
|
||||
// Try to compress the variant region; note that using the hetRefPosition protects us from trying to compress
|
||||
// variant regions that are created by insertions (since we can't confirm here that they represent the same allele)
|
||||
if ( hetRefPosition != -1 ) {
|
||||
allReads = createPolyploidConsensus(start, stop, ((HeaderElement) header[hetRefPosition]).getLocation());
|
||||
// Note that using the hetRefPosition protects us from trying to compress variant regions that are created by
|
||||
// insertions (which we don't want because we can't confirm that they represent the same allele).
|
||||
// Also, we only allow polyploid consensus creation at known sites.
|
||||
if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) {
|
||||
|
||||
// try to create the polyploid consensus
|
||||
final ObjectList<GATKSAMRecord> polyploidReads = createPolyploidConsensus(start, stop, hetRefPosition);
|
||||
|
||||
// if successful we are good to go!
|
||||
if ( polyploidReads != null ) {
|
||||
allReads.addAll(polyploidReads);
|
||||
successfullyCreatedPolyploidConsensus = true;
|
||||
}
|
||||
}
|
||||
// Return all reads that overlap the variant region and remove them from the window header entirely
|
||||
// also remove all reads preceding the variant region (since they will be output as consensus right after compression
|
||||
else {
|
||||
|
||||
// if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them
|
||||
// from the window header entirely; also remove all reads preceding the variant region (since they will be output
|
||||
// as consensus right after compression)
|
||||
if ( !successfullyCreatedPolyploidConsensus ) {
|
||||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
|
|
@ -678,35 +682,50 @@ public class SlidingWindow {
|
|||
toRemove.add(read);
|
||||
}
|
||||
}
|
||||
removeReadsFromWindow(toRemove);
|
||||
|
||||
// remove all used reads
|
||||
for ( final GATKSAMRecord read : toRemove )
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
return allReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the given position match one of the known sites
|
||||
*
|
||||
* @param targetPosition the position of the het site
|
||||
* @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here
|
||||
* @return true if the targetPosition matches a known SNP position, false otherwise
|
||||
*/
|
||||
@Requires({"targetPosition >= 1 && knownSnpPositions != null"})
|
||||
protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition);
|
||||
return knownSnpPositions.contains(targetLoc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Finds the het variant position located within start and stop (inclusive) if one exists.
|
||||
*
|
||||
* @param header the header element array
|
||||
* @param start the first header index in the region to check (inclusive)
|
||||
* @param stop the last header index of the region to check (inclusive)
|
||||
* @return the window header index of the single het position or -1 if either none or more than one exists
|
||||
*/
|
||||
@Requires("header != null && start >= 0 && (stop >= start || stop == 0)")
|
||||
protected int findSinglePolyploidCompressiblePosition(final Object[] header, final int start, final int stop) {
|
||||
@Requires("start >= 0 && (stop >= start || stop == 0)")
|
||||
protected int findSinglePolyploidCompressiblePosition(final int start, final int stop) {
|
||||
int hetRefPosition = -1;
|
||||
|
||||
for ( int i = start; i <= stop; i++ ) {
|
||||
|
||||
final int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
|
||||
final int nAlleles = windowHeader.get(i).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false);
|
||||
|
||||
// we will only work on diploid cases because we just don't want to handle/test other scenarios
|
||||
if ( nAlleles > 2 )
|
||||
// we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios
|
||||
if ( nAlleles > 2 || nAlleles == -1 )
|
||||
return -1;
|
||||
|
||||
if ( nAlleles == 2 ) {
|
||||
|
||||
// make sure that there is only 1 site in the region that contains more than one allele
|
||||
if ( hetRefPosition >= 0 )
|
||||
if ( hetRefPosition != -1 )
|
||||
return -1;
|
||||
|
||||
hetRefPosition = i;
|
||||
|
|
@ -716,21 +735,43 @@ public class SlidingWindow {
|
|||
return hetRefPosition;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether there's a position in the header with a significant number of softclips.
|
||||
*
|
||||
* @param header the window header to examine
|
||||
* @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument)
|
||||
* @return true if there exists a position with significant softclips, false otherwise
|
||||
*/
|
||||
@Requires("header != null")
|
||||
protected boolean hasSignificantSoftclipPosition(final List<HeaderElement> header, final int positionToSkip) {
|
||||
|
||||
for ( final HeaderElement headerElement : header ) {
|
||||
|
||||
if ( headerElement.getLocation() == positionToSkip )
|
||||
continue;
|
||||
|
||||
if ( headerElement.hasSignificantSoftclips(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT) )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalizes a variant region, any adjacent synthetic reads.
|
||||
*
|
||||
* @param start the first window header index in the variant region (inclusive)
|
||||
* @param stop the last window header index of the variant region (inclusive)
|
||||
* @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here?
|
||||
* @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here
|
||||
* @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads
|
||||
*/
|
||||
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
|
||||
@Ensures("result != null")
|
||||
protected ObjectList<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
ObjectList<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
|
||||
protected ObjectList<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectList<GATKSAMRecord> allReads = compressVariantRegion(start, stop, knownSnpPositions);
|
||||
|
||||
ObjectList<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
|
||||
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
||||
result.addAll(addToSyntheticReads(windowHeader, 0, stop+1, SyntheticRead.StrandType.STRANDLESS));
|
||||
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
||||
|
||||
return result; // finalized reads will be downsampled if necessary
|
||||
|
|
@ -739,10 +780,11 @@ public class SlidingWindow {
|
|||
/*
|
||||
* Finalizes the list of regions requested (and any regions preceding them)
|
||||
*
|
||||
* @param regions the list of regions to finalize
|
||||
* @param regions the list of regions to finalize
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return a non-null set of reduced reads representing the finalized regions
|
||||
*/
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions) {
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
final ObjectAVLTreeSet<GATKSAMRecord> allReads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
if ( !regions.isEmpty() ) {
|
||||
|
||||
|
|
@ -754,7 +796,7 @@ public class SlidingWindow {
|
|||
final int start = region.getStart() - windowHeaderStart;
|
||||
final int stop = region.getStop() - windowHeaderStart;
|
||||
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track
|
||||
allReads.addAll(closeVariantRegion(start, stop, knownSnpPositions));
|
||||
|
||||
// We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions.
|
||||
// Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads
|
||||
|
|
@ -772,6 +814,7 @@ public class SlidingWindow {
|
|||
if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() )
|
||||
windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight()));
|
||||
}
|
||||
|
||||
return allReads;
|
||||
}
|
||||
|
||||
|
|
@ -804,10 +847,11 @@ public class SlidingWindow {
|
|||
* regions that still exist regardless of being able to fulfill the
|
||||
* context size requirement in the end.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> close() {
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
// mark variant regions
|
||||
ObjectSet<GATKSAMRecord> finalizedReads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
CompressionStash regions = new CompressionStash();
|
||||
|
|
@ -816,10 +860,10 @@ public class SlidingWindow {
|
|||
if (!windowHeader.isEmpty()) {
|
||||
markSites(getStopLocation(windowHeader) + 1);
|
||||
regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
|
||||
finalizedReads = closeVariantRegions(regions);
|
||||
finalizedReads = closeVariantRegions(regions, knownSnpPositions);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
|
||||
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS));
|
||||
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
|
||||
}
|
||||
}
|
||||
|
|
@ -863,86 +907,135 @@ public class SlidingWindow {
|
|||
return finalizedRead;
|
||||
}
|
||||
|
||||
// define this so that we can use Java generics below
|
||||
private static class HeaderElementList extends LinkedList<HeaderElement> {}
|
||||
|
||||
/**
|
||||
* Finalizes a variant region, any adjacent synthetic reads.
|
||||
* Finalizes a variant region for point mutations, and any adjacent synthetic reads. Indel sites are not supported.
|
||||
*
|
||||
* @param start the first window header index in the variant region (inclusive)
|
||||
* @param start the first window header index of the variant region (inclusive)
|
||||
* @param stop the last window header index of the variant region (inclusive)
|
||||
* @param hetRefPosition reference position (in global coordinates) of the het site
|
||||
* @return a non-null list of all reads contained in the variant region as a polyploid consensus
|
||||
* @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE!
|
||||
* @return a list of all reads contained in the variant region as a polyploid consensus, or null if not possible
|
||||
*/
|
||||
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
|
||||
@Ensures("result != null")
|
||||
private ObjectList<GATKSAMRecord> createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) {
|
||||
// we will create two (positive strand, negative strand) headers for each contig
|
||||
ObjectList<LinkedList<HeaderElement>> headersPosStrand = new ObjectArrayList<LinkedList<HeaderElement>>();
|
||||
ObjectList<LinkedList<HeaderElement>> headersNegStrand = new ObjectArrayList<LinkedList<HeaderElement>>();
|
||||
ObjectList<GATKSAMRecord> hetReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
Byte2IntMap haplotypeHeaderMap = new Byte2IntArrayMap(2);
|
||||
int currentHaplotype = 0;
|
||||
int refStart = windowHeader.get(start).getLocation();
|
||||
int refStop = windowHeader.get(stop).getLocation();
|
||||
ObjectList<GATKSAMRecord> toRemove = new ObjectArrayList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) {
|
||||
int haplotype;
|
||||
protected ObjectList<GATKSAMRecord> createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) {
|
||||
// we will create two (positive strand, negative strand) headers for each haplotype
|
||||
final HeaderElementList[] headersPosStrand = new HeaderElementList[2];
|
||||
final HeaderElementList[] headersNegStrand = new HeaderElementList[2];
|
||||
|
||||
// check if the read is either before or inside the variant region
|
||||
if (read.getSoftStart() <= refStop) {
|
||||
// check if the read is inside the variant region
|
||||
if (read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) {
|
||||
// check if the read contains the het site
|
||||
if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) {
|
||||
int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
|
||||
// TODO -- THIS IS A HUGE BUG AS IT WILL NOT WORK FOR DELETIONS; see commented out unit test
|
||||
byte base = read.getReadBases()[readPos];
|
||||
byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos];
|
||||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation();
|
||||
|
||||
// check if base passes the filters!
|
||||
if (qual >= MIN_BASE_QUAL_TO_COUNT) {
|
||||
// check which haplotype this read represents and take the index of it from the list of headers
|
||||
if (haplotypeHeaderMap.containsKey(base)) {
|
||||
haplotype = haplotypeHeaderMap.get(base);
|
||||
}
|
||||
// create new lists if this haplotype has not been seen yet
|
||||
else {
|
||||
haplotype = currentHaplotype;
|
||||
haplotypeHeaderMap.put(base, currentHaplotype);
|
||||
headersPosStrand.add(new LinkedList<HeaderElement>());
|
||||
headersNegStrand.add(new LinkedList<HeaderElement>());
|
||||
currentHaplotype++;
|
||||
}
|
||||
LinkedList<HeaderElement> header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype);
|
||||
// add to the polyploid header
|
||||
// initialize the mapping from base (allele) to header
|
||||
final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2);
|
||||
for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false) ) {
|
||||
final int currentIndex = alleleHeaderMap.size();
|
||||
if ( currentIndex > 1 )
|
||||
throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus");
|
||||
|
||||
alleleHeaderMap.put(allele.b, currentIndex);
|
||||
headersPosStrand[currentIndex] = new HeaderElementList();
|
||||
headersNegStrand[currentIndex] = new HeaderElementList();
|
||||
}
|
||||
|
||||
// sanity check that we saw 2 alleles
|
||||
if ( alleleHeaderMap.size() != 2 )
|
||||
throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size());
|
||||
|
||||
final ObjectList<GATKSAMRecord> toRemoveFromReadCache = new ObjectArrayList<GATKSAMRecord>();
|
||||
final ObjectList<GATKSAMRecord> toRemoveFromHeader = new ObjectArrayList<GATKSAMRecord>();
|
||||
|
||||
for ( final GATKSAMRecord read : readsInWindow ) {
|
||||
|
||||
// if the read falls after the region, just skip it for now (we'll get to it later)
|
||||
if ( read.getSoftStart() > refStop )
|
||||
continue;
|
||||
|
||||
// if the read falls before the region, remove it
|
||||
if ( read.getSoftEnd() < refStart ) {
|
||||
toRemoveFromReadCache.add(read);
|
||||
continue;
|
||||
}
|
||||
|
||||
// check whether the read spans the het site
|
||||
if ( read.getSoftStart() <= globalHetRefPosition && read.getSoftEnd() >= globalHetRefPosition ) {
|
||||
|
||||
// make sure it meets the minimum mapping quality requirement (if not, we'll remove it and not use it for the consensuses)
|
||||
if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) {
|
||||
|
||||
// where on the read is the het position?
|
||||
final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
|
||||
|
||||
// this is safe because indels are not supported
|
||||
final byte base = read.getReadBases()[readPosOfHet];
|
||||
final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet];
|
||||
|
||||
// make sure that the base passes filters (if not, we'll remove it and not use it for the consensuses)
|
||||
if ( qual >= MIN_BASE_QUAL_TO_COUNT ) {
|
||||
|
||||
// check which allele this read represents
|
||||
final Integer allele = alleleHeaderMap.get(base);
|
||||
|
||||
// ignore the read if it represents a base that's not part of the consensus
|
||||
if ( allele != null ) {
|
||||
// add to the appropriate polyploid header
|
||||
final LinkedList<HeaderElement> header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele];
|
||||
addToHeader(header, read);
|
||||
// remove from the standard header so that we don't double count it
|
||||
removeFromHeader(windowHeader, read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// we remove all reads before and inside the variant region from the window
|
||||
toRemove.add(read);
|
||||
// remove from the standard header so that we don't double count it
|
||||
toRemoveFromHeader.add(read);
|
||||
}
|
||||
|
||||
// we remove all reads falling inside the variant region from the window
|
||||
toRemoveFromReadCache.add(read);
|
||||
}
|
||||
|
||||
for (LinkedList<HeaderElement> header : headersPosStrand) {
|
||||
if (header.size() > 0)
|
||||
hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false));
|
||||
if (runningConsensus != null)
|
||||
hetReads.add(finalizeRunningConsensus());
|
||||
// sanity check that no new "variant region" exists on just a single consensus strand
|
||||
// due to softclips now that we've broken everything out into their component parts
|
||||
for ( final LinkedList<HeaderElement> header : headersPosStrand ) {
|
||||
if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) )
|
||||
return null;
|
||||
}
|
||||
for (LinkedList<HeaderElement> header : headersNegStrand) {
|
||||
if (header.size() > 0)
|
||||
hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true));
|
||||
if (runningConsensus != null)
|
||||
hetReads.add(finalizeRunningConsensus());
|
||||
for ( final LinkedList<HeaderElement> header : headersNegStrand ) {
|
||||
if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) )
|
||||
return null;
|
||||
}
|
||||
|
||||
removeReadsFromWindow(toRemove);
|
||||
// create the polyploid synthetic reads
|
||||
final ObjectList<GATKSAMRecord> hetReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
for ( final LinkedList<HeaderElement> header : headersPosStrand )
|
||||
finalizeHetConsensus(header, false, hetReads);
|
||||
for ( final LinkedList<HeaderElement> header : headersNegStrand )
|
||||
finalizeHetConsensus(header, true, hetReads);
|
||||
|
||||
// remove all used reads
|
||||
for ( final GATKSAMRecord read : toRemoveFromReadCache )
|
||||
readsInWindow.remove(read);
|
||||
for ( final GATKSAMRecord read : toRemoveFromHeader )
|
||||
removeFromHeader(windowHeader, read);
|
||||
|
||||
return hetReads;
|
||||
}
|
||||
|
||||
/*
|
||||
* Finalizes a particular het consensus for the given header representation
|
||||
*
|
||||
* @param header the list of header elements representing the header for the consensus
|
||||
* @param isNegativeStrand does this header represent reads on the negative strand?
|
||||
* @param result list in which to store results
|
||||
*/
|
||||
protected void finalizeHetConsensus(final LinkedList<HeaderElement> header, final boolean isNegativeStrand, final ObjectList<GATKSAMRecord> result) {
|
||||
if ( header.size() > 0 )
|
||||
result.addAll(addToSyntheticReads(header, 0, header.size(), isNegativeStrand ? SyntheticRead.StrandType.NEGATIVE : SyntheticRead.StrandType.POSITIVE));
|
||||
if ( runningConsensus != null )
|
||||
result.add(finalizeRunningConsensus());
|
||||
}
|
||||
|
||||
private void addToHeader(LinkedList<HeaderElement> header, GATKSAMRecord read) {
|
||||
updateHeaderCounts(header, read, false);
|
||||
}
|
||||
|
|
@ -1101,11 +1194,5 @@ public class SlidingWindow {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void removeReadsFromWindow (final ObjectList<GATKSAMRecord> readsToRemove) {
|
||||
for (final GATKSAMRecord read : readsToRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -76,9 +76,17 @@ import java.util.Iterator;
|
|||
* @since 8/26/11
|
||||
*/
|
||||
public class SyntheticRead {
|
||||
// Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce
|
||||
// memory footprint.
|
||||
// TODO: better name
|
||||
|
||||
/**
|
||||
* The types of strandedness for synthetic reads
|
||||
*/
|
||||
public enum StrandType {
|
||||
POSITIVE,
|
||||
NEGATIVE,
|
||||
STRANDLESS
|
||||
}
|
||||
|
||||
// Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint.
|
||||
private static class SingleBaseInfo {
|
||||
byte baseIndexOrdinal; // enum BaseIndex.ordinal
|
||||
byte count;
|
||||
|
|
@ -134,7 +142,7 @@ public class SyntheticRead {
|
|||
private String readName;
|
||||
private int refStart;
|
||||
private boolean hasIndelQualities = false;
|
||||
private boolean isNegativeStrand = false;
|
||||
private StrandType strandType = StrandType.STRANDLESS;
|
||||
|
||||
/**
|
||||
* Full initialization of the running consensus if you have all the information and are ready to
|
||||
|
|
@ -147,7 +155,7 @@ public class SyntheticRead {
|
|||
* @param readName the read's name
|
||||
* @param refStart the alignment start (reference based)
|
||||
*/
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) {
|
||||
final int initialCapacity = 10000;
|
||||
basesCountsQuals = new ObjectArrayList<SingleBaseInfo>(initialCapacity);
|
||||
mappingQuality = 0.0;
|
||||
|
|
@ -159,10 +167,10 @@ public class SyntheticRead {
|
|||
this.readName = readName;
|
||||
this.refStart = refStart;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.isNegativeStrand = isNegativeRead;
|
||||
this.strandType = strandType;
|
||||
}
|
||||
|
||||
public SyntheticRead(ObjectArrayList<BaseIndex> bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
public SyntheticRead(ObjectArrayList<BaseIndex> bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) {
|
||||
basesCountsQuals = new ObjectArrayList<SingleBaseInfo>(bases.size());
|
||||
for (int i = 0; i < bases.size(); ++i) {
|
||||
basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i)));
|
||||
|
|
@ -175,7 +183,7 @@ public class SyntheticRead {
|
|||
this.readName = readName;
|
||||
this.refStart = refStart;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.isNegativeStrand = isNegativeRead;
|
||||
this.strandType = strandType;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -216,8 +224,11 @@ public class SyntheticRead {
|
|||
read.setReferenceIndex(contigIndex);
|
||||
read.setReadPairedFlag(false);
|
||||
read.setReadUnmappedFlag(false);
|
||||
read.setReadNegativeStrandFlag(isNegativeStrand);
|
||||
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
|
||||
if ( strandType != StrandType.STRANDLESS ) {
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line
|
||||
read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE);
|
||||
}
|
||||
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
|
||||
read.setAlignmentStart(refStart);
|
||||
read.setReadName(readName);
|
||||
read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
|
||||
|
|
|
|||
|
|
@ -599,9 +599,9 @@ public class UnifiedGenotyperEngine {
|
|||
int numDeletions = 0;
|
||||
for ( final PileupElement p : rawContext.getBasePileup() ) {
|
||||
if ( p.isDeletion() )
|
||||
numDeletions++;
|
||||
numDeletions += p.getRepresentativeCount();
|
||||
}
|
||||
if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) {
|
||||
if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,175 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Emits intervals present in either the original or reduced bam but not the other.
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The original and reduced BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A list of intervals present in one bam but not the other.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -I:original original.bam \
|
||||
* -I:reduced reduced.bam \
|
||||
* -R ref.fasta \
|
||||
* -T AssessReducedCoverage \
|
||||
* -o output.intervals
|
||||
* </pre>
|
||||
*
|
||||
* @author ebanks
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
@Hidden
|
||||
public class AssessReducedCoverage extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
|
||||
|
||||
private static final String original = "original";
|
||||
private static final String reduced = "reduced";
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
@Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false)
|
||||
public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false;
|
||||
|
||||
public void initialize() {}
|
||||
|
||||
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
final Set<String> tags = getAllTags(context.getBasePileup());
|
||||
return (tags.contains(original) && !tags.contains(reduced)) ||
|
||||
(OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? ref.getLocus() : null;
|
||||
}
|
||||
|
||||
private Set<String> getAllTags(final ReadBackedPileup pileup) {
|
||||
|
||||
final Set<String> tags = new HashSet<String>(10);
|
||||
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() )
|
||||
tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags());
|
||||
}
|
||||
|
||||
return tags;
|
||||
}
|
||||
|
||||
public void onTraversalDone(GenomeLoc sum) {
|
||||
if ( sum != null )
|
||||
out.println(sum);
|
||||
}
|
||||
|
||||
public GenomeLoc reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) {
|
||||
if ( lhs == null )
|
||||
return rhs;
|
||||
|
||||
if ( rhs == null )
|
||||
return lhs;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( lhs.contiguousP(rhs) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop());
|
||||
|
||||
// otherwise, print the lhs and start over with the rhs
|
||||
out.println(lhs);
|
||||
return rhs;
|
||||
}
|
||||
|
||||
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
if ( sum == null )
|
||||
return value;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( sum.contiguousP(value) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop());
|
||||
|
||||
// otherwise, print the sum and start over with the value
|
||||
out.println(sum);
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
|
@ -53,6 +53,7 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class HeaderElementUnitTest extends BaseTest {
|
||||
|
|
@ -136,10 +137,12 @@ public class HeaderElementUnitTest extends BaseTest {
|
|||
private class AllelesTest {
|
||||
public final int[] counts;
|
||||
public final double proportion;
|
||||
public final boolean allowDeletions;
|
||||
|
||||
private AllelesTest(final int[] counts, final double proportion) {
|
||||
private AllelesTest(final int[] counts, final double proportion, final boolean allowDeletions) {
|
||||
this.counts = counts;
|
||||
this.proportion = proportion;
|
||||
this.allowDeletions = allowDeletions;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -150,12 +153,16 @@ public class HeaderElementUnitTest extends BaseTest {
|
|||
final int[] counts = new int[]{ 0, 5, 10, 15, 20 };
|
||||
final double [] proportions = new double[]{ 0.0, 0.05, 0.10, 0.50, 1.0 };
|
||||
|
||||
for ( final int count1 : counts ) {
|
||||
for ( final int count2 : counts ) {
|
||||
for ( final int count3 : counts ) {
|
||||
for ( final int count4 : counts ) {
|
||||
for ( final double proportion : proportions ) {
|
||||
tests.add(new Object[]{new AllelesTest(new int[]{count1, count2, count3, count4}, proportion)});
|
||||
for ( final int countA : counts ) {
|
||||
for ( final int countC : counts ) {
|
||||
for ( final int countG : counts ) {
|
||||
for ( final int countT : counts ) {
|
||||
for ( final int countD : counts ) {
|
||||
for ( final double proportion : proportions ) {
|
||||
for ( final boolean allowDeletions : Arrays.asList(true, false) ) {
|
||||
tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, proportion, allowDeletions)});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -170,24 +177,27 @@ public class HeaderElementUnitTest extends BaseTest {
|
|||
|
||||
HeaderElement headerElement = new HeaderElement(1000, 0);
|
||||
for ( int i = 0; i < test.counts.length; i++ ) {
|
||||
BaseIndex base = BaseIndex.values()[i];
|
||||
final BaseIndex base = BaseIndex.values()[i];
|
||||
for ( int j = 0; j < test.counts[i]; j++ )
|
||||
headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false);
|
||||
}
|
||||
|
||||
final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion);
|
||||
final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion);
|
||||
final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion, test.allowDeletions);
|
||||
final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion, test.allowDeletions);
|
||||
|
||||
Assert.assertEquals(nAllelesSeen, nAllelesExpected);
|
||||
}
|
||||
|
||||
private static int calculateExpectedAlleles(final int[] counts, final double proportion) {
|
||||
private static int calculateExpectedAlleles(final int[] counts, final double proportion, final boolean allowDeletions) {
|
||||
double total = 0.0;
|
||||
for ( final int count : counts ) {
|
||||
total += count;
|
||||
}
|
||||
|
||||
final int minCount = (int)(proportion * total);
|
||||
final int minCount = Math.max(1, (int)(proportion * total));
|
||||
|
||||
if ( !allowDeletions && counts[BaseIndex.D.index] >= minCount )
|
||||
return -1;
|
||||
|
||||
int result = 0;
|
||||
for ( final int count : counts ) {
|
||||
|
|
|
|||
|
|
@ -47,12 +47,16 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class ReduceReadsIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final static String DBSNP = b37dbSNP132;
|
||||
final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam";
|
||||
final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam";
|
||||
|
|
@ -67,48 +71,128 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam";
|
||||
final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam";
|
||||
|
||||
private void RRTest(String testName, String args, String md5) {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5));
|
||||
final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e";
|
||||
|
||||
protected Pair<List<File>, List<String>> executeTest(final String name, final WalkerTestSpec spec) {
|
||||
final Pair<List<File>, List<String>> result = super.executeTest(name, spec);
|
||||
|
||||
// perform some Reduce Reads specific testing now
|
||||
if ( result != null ) {
|
||||
|
||||
// generate a new command-line based on the old one
|
||||
spec.disableImplicitArgs();
|
||||
final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" ");
|
||||
|
||||
final StringBuilder newArgs = new StringBuilder();
|
||||
for ( int i = 0; i < originalArgs.length; i++ ) {
|
||||
final String arg = originalArgs[i];
|
||||
if ( arg.equals("-T") ) {
|
||||
newArgs.append("-T AssessReducedCoverage ");
|
||||
} else if ( arg.startsWith("-I") ) {
|
||||
newArgs.append("-I:original ");
|
||||
newArgs.append(originalArgs[++i]);
|
||||
newArgs.append(" ");
|
||||
} else if ( arg.equals("-R") || arg.equals("-L") ) {
|
||||
newArgs.append(arg);
|
||||
newArgs.append(" ");
|
||||
newArgs.append(originalArgs[++i]);
|
||||
newArgs.append(" ");
|
||||
}
|
||||
}
|
||||
for ( final File file : result.getFirst() ) {
|
||||
newArgs.append("-I:reduced ");
|
||||
newArgs.append(file.getAbsolutePath());
|
||||
newArgs.append(" ");
|
||||
}
|
||||
newArgs.append("-o %s");
|
||||
|
||||
super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(newArgs.toString(), Arrays.asList(emptyFileMd5)));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
protected Pair<List<File>, List<String>> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) {
|
||||
return super.executeTest(name, spec);
|
||||
}
|
||||
|
||||
private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? " -known " + DBSNP : "") + " ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5));
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompression() {
|
||||
RRTest("testDefaultCompression ", L, "16d97a47b8dbfae4ea64fbdf522b693c");
|
||||
RRTest("testDefaultCompression ", L, "538362abd504200800145720b23c98ce", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testInsertionsAtEdgeOfConsensus() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s ";
|
||||
executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("f7a9a27c5eaf791b67a768fff960a9e1")));
|
||||
public void testDefaultCompressionWithKnowns() {
|
||||
RRTest("testDefaultCompressionWithKnowns ", L, "79cdbd997196957af63f46353cff710b", true);
|
||||
}
|
||||
|
||||
private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervals() {
|
||||
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
RRTest("testMultipleIntervals ", intervals, "8886ba383e21883241b386882e8e5063");
|
||||
RRTest("testMultipleIntervals ", intervals, "6733b25e87e3fce5753cf7936ccf934f", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervalsWithKnowns() {
|
||||
RRTest("testMultipleIntervalsWithKnowns ", intervals, "99e2a79befc71eaadb4197c66a0d6df8", true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testHighCompression() {
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "54253f25d363852a1182aff33e500b92");
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e3b7e14655973c8950d7fec96321e483", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testHighCompressionWithKnowns() {
|
||||
RRTest("testHighCompressionWithKnowns ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "30a7ed079b3a41ed63e520260fa6afe3", true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLowCompression() {
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "1d7d2d28900db57dad65a8beef64b8cb");
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLowCompressionWithKnowns() {
|
||||
RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testIndelCompression() {
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f58ae2154e0e5716be0e850b7605856e");
|
||||
final String md5 = "f58ae2154e0e5716be0e850b7605856e";
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false);
|
||||
RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testFilteredDeletionCompression() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bfe0693aea74634f1035a9bd11302517")));
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReduction() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReductionWithKnowns() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s ";
|
||||
executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testInsertionsAtEdgeOfConsensus() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s ";
|
||||
executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4445db7aeddaf2f1d86e1af0cdc74c8")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -122,7 +206,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testAddingReadAfterTailingTheStash() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("f118e83c394d21d901a24230379864fc")));
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("f118e83c394d21d901a24230379864fc")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -132,26 +216,20 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bd5198a3e21034887b741faaaa3964bf")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReduction() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("81312c31b9910a42bff6acb5167592ab")));
|
||||
// we expect to lose coverage due to the downsampling so don't run the systematic coverage test
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bd5198a3e21034887b741faaaa3964bf")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will
|
||||
* Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will
|
||||
* break the current hard-clipping routine that protects reduce reads from such reads.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testReadOffContig() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s ";
|
||||
executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("b4dc66445ddf5f467f67860bed023ef8")));
|
||||
executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4dc66445ddf5f467f67860bed023ef8")));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Confirm that if both ends of pair are in same variant region, compressed names of both ends of pair are the same.
|
||||
*/
|
||||
|
|
@ -159,7 +237,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
public void testPairedReadsInVariantRegion() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) +
|
||||
" -o %s --downsample_coverage 250 -dcov 50 ";
|
||||
executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("9bed260b6245f5ff47db8541405504aa")));
|
||||
executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa")));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -49,12 +49,29 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
|
||||
|
|
@ -96,7 +113,7 @@ public class ReduceReadsUnitTest extends BaseTest {
|
|||
/**
|
||||
* Test the read name compression functionality
|
||||
*/
|
||||
@Test(dataProvider = "ReadNameProvider")
|
||||
@Test(dataProvider = "ReadNameProvider", enabled = false)
|
||||
public void testReadNameCompression(final String name, final boolean alreadySeen) {
|
||||
GATKSAMRecord read = GATKSAMRecord.createRandomRead(1);
|
||||
read.setReadName(name);
|
||||
|
|
@ -108,4 +125,90 @@ public class ReduceReadsUnitTest extends BaseTest {
|
|||
Assert.assertTrue(hash.containsKey(name));
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests the functionality related to known SNP positions ////
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private static SAMFileHeader header;
|
||||
private static GenomeLocParser genomeLocParser;
|
||||
|
||||
@BeforeClass
|
||||
public void beforeClass() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 100);
|
||||
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
@DataProvider(name = "PopulateKnownsProvider")
|
||||
public Object[][] populateKnownsProvider() {
|
||||
|
||||
final Allele A = Allele.create("A", true);
|
||||
final Allele C = Allele.create("C");
|
||||
final Allele G = Allele.create("G");
|
||||
final Allele AC = Allele.create("AC");
|
||||
|
||||
final VariantContext snp_1_10 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_1_10_2 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, G)).make();
|
||||
final VariantContext snp_1_20 = new VariantContextBuilder("known", "chr1", 20, 20, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_1_30 = new VariantContextBuilder("known", "chr1", 30, 30, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_2_10 = new VariantContextBuilder("known", "chr2", 10, 10, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_3_10 = new VariantContextBuilder("known", "chr3", 10, 10, Arrays.asList(A, C)).make();
|
||||
final VariantContext indel_1_40 = new VariantContextBuilder("known", "chr1", 40, 40, Arrays.asList(A, AC)).make();
|
||||
final VariantContext indel_2_40 = new VariantContextBuilder("known", "chr2", 40, 40, Arrays.asList(A, AC)).make();
|
||||
|
||||
final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "foo1", 0, 1, 1);
|
||||
final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "foo2", 1, 1, 1);
|
||||
final GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "foo3", 2, 1, 1);
|
||||
|
||||
final ObjectArrayList<Object[]> tests = new ObjectArrayList<Object[]>();
|
||||
|
||||
// test single
|
||||
tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10))});
|
||||
|
||||
// test multiple at one position
|
||||
tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_10_2))});
|
||||
|
||||
// test multiple
|
||||
tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
|
||||
|
||||
// test indel not used
|
||||
tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_1_40))});
|
||||
tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_2_40))});
|
||||
|
||||
// test read clears
|
||||
tests.add(new Object[]{3, 0, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
|
||||
tests.add(new Object[]{4, 1, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))});
|
||||
tests.add(new Object[]{3, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
|
||||
tests.add(new Object[]{4, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))});
|
||||
tests.add(new Object[]{4, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_3_10))});
|
||||
tests.add(new Object[]{5, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10), makeRefMetaDataTracker(snp_3_10))});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private final RefMetaDataTracker makeRefMetaDataTracker(final Feature feature) {
|
||||
final List<GATKFeature> x = new ArrayList<GATKFeature>();
|
||||
x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, feature, "known"));
|
||||
final RODRecordList rods = new RODRecordListImpl("known", x, genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()));
|
||||
return new RefMetaDataTracker(Arrays.asList(rods));
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PopulateKnownsProvider")
|
||||
public void testPopulateKnowns(final int expectedSizeBeforeClear, final int expectedSizeAfterClear, final GATKSAMRecord read, final List<RefMetaDataTracker> trackers) {
|
||||
final ReduceReads rr = new ReduceReads();
|
||||
RodBinding.resetNameCounter();
|
||||
rr.known = Arrays.<RodBinding<VariantContext>>asList(new RodBinding(VariantContext.class, "known"));
|
||||
|
||||
final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
engine.setGenomeLocParser(genomeLocParser);
|
||||
rr.setToolkit(engine);
|
||||
|
||||
for ( final RefMetaDataTracker tracker : trackers )
|
||||
rr.populateKnownSNPs(tracker);
|
||||
Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeBeforeClear);
|
||||
|
||||
rr.clearStaleKnownPositions(read);
|
||||
Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeAfterClear);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -46,9 +46,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectList;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSet;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
|
|
@ -317,6 +315,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295);
|
||||
private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309);
|
||||
private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310);
|
||||
private static final GenomeLoc loc312 = new UnvalidatingGenomeLoc("1", 0, 1000312, 1000312);
|
||||
private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100);
|
||||
|
||||
@DataProvider(name = "ConsensusCreation")
|
||||
|
|
@ -325,10 +324,11 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
|
||||
// test high quality reads and bases
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(), false, false, 1, 1)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), false, false, 9, 5)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), false, false, 9, 6)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), false, false, 10, 10)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), false, false, 10, 10)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), false, false, 11, 11)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc312), false, false, 11, 8)});
|
||||
|
||||
// test low quality reads
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(), true, false, 1, 1)});
|
||||
|
|
@ -349,8 +349,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), false, true, 3, 3)});
|
||||
|
||||
// test I/D operators
|
||||
// TODO -- uncomment this test when the deletion bug is fixed!
|
||||
// tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), CigarOperator.D, 9, 5)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), CigarOperator.D, 9, 9)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), CigarOperator.D, 10, 10)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), CigarOperator.D, 10, 10)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), CigarOperator.D, 11, 11)});
|
||||
|
|
@ -364,23 +363,66 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "ConsensusCreation", enabled = true)
|
||||
public void testConsensusCreationTest(ConsensusCreationTest test) {
|
||||
// test WITHOUT het compression allowed
|
||||
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false);
|
||||
final ObjectAVLTreeSet<GenomeLoc> knownSNPs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
// test WITHOUT het compression
|
||||
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close();
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads);
|
||||
|
||||
// test WITH het compression allowed
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, true);
|
||||
// test WITH het compression
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
result = slidingWindow.close();
|
||||
for ( int i = 0; i < 1200; i++ )
|
||||
knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConsensusCreationForMultiallelic() {
|
||||
|
||||
final int totalNumReads = 7;
|
||||
final ObjectList<GATKSAMRecord> myReads = new ObjectArrayList<GATKSAMRecord>(totalNumReads);
|
||||
|
||||
for ( int i = 0; i < totalNumReads; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength);
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
|
||||
final char base = i < totalNumReads - 2 ? 'A' : ( i == totalNumReads - 2 ? 'C' : 'G');
|
||||
read.setReadBases(Utils.dupBytes((byte) base, readLength));
|
||||
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
final ObjectAVLTreeSet<GenomeLoc> knownSNPs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
// test WITHOUT het compression
|
||||
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
|
||||
// test WITH het compression
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
//// This section tests the downsampling functionality ////
|
||||
|
|
@ -398,7 +440,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "Downsampling", enabled = true)
|
||||
public void testDownsamplingTest(final int dcov) {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false, false);
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
final ObjectList<GATKSAMRecord> result = slidingWindow.downsampleVariantRegion(basicReads);
|
||||
|
||||
Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size()));
|
||||
|
|
@ -446,10 +488,10 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "ConsensusQuals", enabled = true)
|
||||
public void testConsensusQualsTest(QualsTest test) {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false);
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
final Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close();
|
||||
final Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet<GenomeLoc>());
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), 1);
|
||||
final GATKSAMRecord read = result.getFirst().iterator().next();
|
||||
|
|
@ -515,7 +557,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
read.setBaseQualities(Utils.dupBytes((byte) 30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false);
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start);
|
||||
|
||||
Assert.assertEquals(newIndex, start > 0 ? start : 0);
|
||||
|
|
@ -559,7 +601,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
read.setMappingQuality(30);
|
||||
|
||||
// add the read
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false);
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start);
|
||||
for ( int i = 0; i < start; i++ )
|
||||
Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0);
|
||||
|
|
@ -573,4 +615,84 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
for ( int i = 0; i < currentHeaderLength; i++ )
|
||||
Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0);
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests functionality related to polyploid consensus creation ////
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@DataProvider(name = "MatchesKnownProvider")
|
||||
public Object[][] matchesKnownProvider() {
|
||||
|
||||
final ObjectArrayList<Object[]> tests = new ObjectArrayList<Object[]>();
|
||||
|
||||
// test no knowns
|
||||
tests.add(new Object[]{new ObjectAVLTreeSet<GenomeLoc>(), loc290.getStart(), false});
|
||||
|
||||
final ObjectSortedSet<GenomeLoc> knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
knownSnpPositions.add(loc290);
|
||||
knownSnpPositions.add(loc295);
|
||||
knownSnpPositions.add(loc310);
|
||||
|
||||
// test overlap
|
||||
tests.add(new Object[]{knownSnpPositions, loc290.getStart(), true});
|
||||
tests.add(new Object[]{knownSnpPositions, loc295.getStart(), true});
|
||||
tests.add(new Object[]{knownSnpPositions, loc310.getStart(), true});
|
||||
tests.add(new Object[]{knownSnpPositions, loc309.getStart(), false});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MatchesKnownProvider")
|
||||
public void testMatchesKnown(final ObjectSortedSet<GenomeLoc> knownSnpPositions, final int targetLoc, final boolean expectedResult) {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10);
|
||||
Assert.assertEquals(slidingWindow.matchesKnownPosition(targetLoc, knownSnpPositions), expectedResult);
|
||||
}
|
||||
|
||||
@DataProvider(name = "SignificantSoftclipsProvider")
|
||||
public Object[][] SignificantSoftclipsTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final int indexWithSoftclips : Arrays.asList(-1, 0, 5, 9) ) {
|
||||
for ( final int indexToSkip : Arrays.asList(-1, 0, 5, 9) ) {
|
||||
tests.add(new Object[]{indexWithSoftclips, indexToSkip});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SignificantSoftclipsProvider", enabled = true)
|
||||
public void significantSoftclipsTest(final int indexWithSoftclips, final int indexToSkip) {
|
||||
|
||||
// set up the window header
|
||||
final int currentHeaderStart = 100;
|
||||
final int currentHeaderLength = 10;
|
||||
final LinkedList<HeaderElement> windowHeader = new LinkedList<HeaderElement>();
|
||||
for ( int i = 0; i < currentHeaderLength; i++ )
|
||||
windowHeader.add(new HeaderElement(currentHeaderStart + i));
|
||||
|
||||
// set up the normal read
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, currentHeaderLength);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', currentHeaderLength));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, currentHeaderLength));
|
||||
read.setMappingQuality(30);
|
||||
|
||||
// add the read
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0);
|
||||
|
||||
// set up and add a soft-clipped read if requested
|
||||
if ( indexWithSoftclips != -1 ) {
|
||||
final GATKSAMRecord softclippedRead = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + indexWithSoftclips, 1);
|
||||
softclippedRead.setReadBases(new byte[]{(byte) 'A'});
|
||||
softclippedRead.setBaseQualities(new byte[]{(byte) 30});
|
||||
softclippedRead.setMappingQuality(30);
|
||||
softclippedRead.setCigarString("1S");
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips);
|
||||
}
|
||||
|
||||
final boolean result = slidingWindow.hasSignificantSoftclipPosition(windowHeader, currentHeaderStart + indexToSkip);
|
||||
Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ public void testBaseCounts() {
|
|||
new TestRead(bases, quals, new byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})};
|
||||
|
||||
for (TestRead testRead : testReads) {
|
||||
SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList<BaseIndex>(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false);
|
||||
SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList<BaseIndex>(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, SyntheticRead.StrandType.STRANDLESS);
|
||||
Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,18 +69,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "866c19ba60862ad1569d88784423ec8c");
|
||||
testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "3e01f990c7a7c25fd9e42be559ca2942");
|
||||
testReducedCalling("INDEL", "9a702e7a85465f6c42d6c1828aee6c38");
|
||||
}
|
||||
|
||||
|
||||
private void testReducedCalling(final String model, final String md5) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1,
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
|
||||
Arrays.asList(md5));
|
||||
executeTest("test calling on a ReducedRead BAM with " + model, spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import java.util.*;
|
|||
public class GATKSAMRecord extends BAMRecord {
|
||||
// ReduceReads specific attribute tags
|
||||
public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool
|
||||
public static final String REDUCED_READ_STRANDED_TAG = "RS"; // marks a stranded synthetic read produced by the ReduceReads tool
|
||||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start
|
||||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end
|
||||
|
||||
|
|
@ -74,7 +75,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
private int softEnd = UNINITIALIZED;
|
||||
private Integer adapterBoundary = null;
|
||||
|
||||
private boolean isStrandlessRead = false;
|
||||
private Boolean isStrandlessRead = null;
|
||||
|
||||
// because some values can be null, we don't want to duplicate effort
|
||||
private boolean retrievedReadGroup = false;
|
||||
|
|
@ -158,6 +159,9 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
* @return true if this read doesn't have meaningful strand information
|
||||
*/
|
||||
public boolean isStrandless() {
|
||||
if ( isStrandlessRead == null ) {
|
||||
isStrandlessRead = isReducedRead() && getCharacterAttribute(REDUCED_READ_STRANDED_TAG) == null;
|
||||
}
|
||||
return isStrandlessRead;
|
||||
}
|
||||
|
||||
|
|
@ -175,7 +179,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void setReadNegativeStrandFlag(boolean flag) {
|
||||
public void setReadNegativeStrandFlag(final boolean flag) {
|
||||
if ( isStrandless() )
|
||||
throw new IllegalStateException("Cannot set the strand of a strandless read");
|
||||
super.setReadNegativeStrandFlag(flag);
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ public class CallableLociIntegrationTest extends WalkerTest {
|
|||
public void testWithReducedRead() {
|
||||
String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1,
|
||||
Arrays.asList("684069ffe94a1175051066ed53f0fd9d", "ebc310cf734d98e26d2d83e16b1144d1"));
|
||||
Arrays.asList("69fc303c888fd1fa2937b9518dc82f9e", "f512a85c373087ce03a24ab0f98522c0"));
|
||||
executeTest("CallableLoci with ReducedRead", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -64,7 +64,6 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
|||
for (int i = 0; i < reducedRead.getReadLength(); i++) {
|
||||
Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i);
|
||||
}
|
||||
Assert.assertEquals(reducedRead.isStrandless(), false, "Reduced reads don't have meaningful strandedness information");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
Loading…
Reference in New Issue