diff --git a/build.xml b/build.xml index 2086d0c9a..7c81c1f20 100644 --- a/build.xml +++ b/build.xml @@ -215,7 +215,7 @@ - + diff --git a/ivy.xml b/ivy.xml index ee24bc367..4f41904ba 100644 --- a/ivy.xml +++ b/ivy.xml @@ -76,7 +76,7 @@ - + diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f954d7650..ede8e9340 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -443,7 +443,7 @@ public class GenomeAnalysisEngine { if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - if(walker instanceof LocusWalker) { + if(walker instanceof LocusWalker || walker instanceof ActiveRegionWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 39e1bdc72..eec440820 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import java.util.Collection; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index ff5e1064b..774b532f3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; @@ -55,7 +56,6 @@ public class LinearMicroScheduler extends MicroScheduler { traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { - LocusWalker lWalker = (LocusWalker)walker; WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { @@ -77,6 +77,12 @@ public class LinearMicroScheduler extends MicroScheduler { done = walker.isDone(); } + // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine + if( traversalEngine instanceof TraverseActiveRegions ) { + final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); + accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator + } + Object result = accumulator.finishTraversal(); printOnTraversalDone(result); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index d013db7e8..508099708 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -128,6 +128,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { traversalEngine = new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { traversalEngine = new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + traversalEngine = new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java new file mode 100644 index 000000000..01bfe396a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -0,0 +1,213 @@ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.LinkedList; +import java.util.Queue; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 12/9/11 + */ + +public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + /** + * our log, which we want to capture anything from this class + */ + protected static Logger logger = Logger.getLogger(TraversalEngine.class); + + private final Queue workQueue = new LinkedList(); + private final LinkedHashSet myReads = new LinkedHashSet(); + + @Override + protected String getTraversalType() { + return "active regions"; + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + + LocusView locusView = getLocusView( walker, dataProvider ); + + int minStart = Integer.MAX_VALUE; + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + + final ArrayList isActiveList = new ArrayList(); + + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + // We keep processing while the next reference location is within the interval + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + GenomeLoc location = locus.getLocation(); + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + if ( locus.hasExtendedEventPileup() ) { + // if the alignment context we received holds an "extended" pileup (i.e. pileup of insertions/deletions + // associated with the current site), we need to update the location. The updated location still starts + // at the current genomic position, but it has to span the length of the longest deletion (if any). + location = engine.getGenomeLocParser().setStop(location,location.getStop()+locus.getExtendedEventPileup().getMaxDeletionLength()); + + // it is possible that the new expanded location spans the current shard boundary; the next method ensures + // that when it is the case, the reference sequence held by the ReferenceView will be reloaded so that + // the view has all the bases we are gonna need. If the location fits within the current view bounds, + // the next call will not do anything to the view: + referenceView.expandBoundsToAccomodateLoc(location); + } + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + final boolean isActive = walker.isActive( tracker, refContext, locus ); + isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser()) ); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + for( final PileupElement p : locus.getBasePileup() ) { + final SAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); + } + } + + // If this is the last pileup for this shard then need to calculate the minimum alignment start so that + // we know which active regions in the work queue are now safe to process + if( !locusView.hasNext() ) { + for( final PileupElement p : locus.getBasePileup() ) { + final SAMRecord read = p.getRead(); + if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); } + } + } + printProgress(dataProvider.getShard(),locus.getLocation()); + } + + // Take the individual isActive calls and integrate them into contiguous active regions and + // add these blocks of work to the work queue + final ArrayList activeRegions = integrateActiveList( isActiveList ); + logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); + workQueue.addAll( activeRegions ); + } + + while( workQueue.peek().getLocation().getStop() < minStart ) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } + + return sum; + } + + // Special function called in LinearMicroScheduler to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine + public T endTraversal( final Walker walker, T sum) { + while( workQueue.peek() != null ) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker) walker ); + } + + return sum; + } + + private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { + final ArrayList placedReads = new ArrayList(); + for( final SAMRecord read : reads ) { + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) + long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); + ActiveRegion bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap(readLoc); + bestRegion = otherRegionToTest; + } + } + bestRegion.add( (GATKSAMRecord) read, true ); + + // The read is also added to all other region in which it overlaps but marked as non-primary + if( !bestRegion.equals(activeRegion) ) { + activeRegion.add( (GATKSAMRecord) read, false ); + } + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getLocation().overlapsP( readLoc ) ) { + activeRegion.add( (GATKSAMRecord) read, false ); + } + } + placedReads.add( read ); + } + } + reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLocation()); + final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker + return walker.reduce( x, sum ); + } + + /** + * Gets the best view of loci for this walker given the available data. + * @param walker walker to interrogate. + * @param dataProvider Data which which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. + */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } + + // integrate active regions into contiguous chunks based on active status + private ArrayList integrateActiveList( final ArrayList activeList ) { + final ArrayList returnList = new ArrayList(); + ActiveRegion prevLocus = activeList.remove(0); + ActiveRegion startLocus = prevLocus; + for( final ActiveRegion thisLocus : activeList ) { + if( prevLocus.isActive != thisLocus.isActive ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser() ) ); + startLocus = thisLocus; + } + prevLocus = thisLocus; + } + // output the last region if necessary + if( startLocus != prevLocus ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser() ) ); + } + return returnList; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java new file mode 100644 index 000000000..d2891c959 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -0,0 +1,29 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 12/7/11 + */ + +@By(DataSource.READS) +@Requires({DataSource.READS, DataSource.REFERENCE_BASES}) +@PartitionBy(PartitionType.READ) +public abstract class ActiveRegionWalker extends Walker { + // Do we actually want to operate on the context? + public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + return true; // We are keeping all the reads + } + + // Determine active status over the AlignmentContext + public abstract boolean isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); + + // Map over the ActiveRegion + public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 5d215603a..6c860fce6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // We refuse to parse SnpEff output files generated by unsupported versions, or // lacking a SnpEff version number in the VCF header: - public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" }; + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.5" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; @@ -204,11 +204,6 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio } public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set headerLines ) { - throw new UserException("SnpEff support is currently disabled in the GATK until SnpEff 2.0.4 is officially released " + - "due to a serious issue with SnpEff versions prior to 2.0.4. Please see this page for more details: " + - "http://www.broadinstitute.org/gsa/wiki/index.php/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator"); - - /* // Make sure that we actually have a valid SnpEff rod binding (just in case the user specified -A SnpEff // without providing a SnpEff rod via --snpEffFile): validateRodBinding(walker.getSnpEffRodBinding()); @@ -228,7 +223,6 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // mistaken in the future for a SnpEff output file: headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_VERSION_LINE_KEY, snpEffVersionLine.getValue())); headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue())); - */ } public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 295cf8688..ae7077230 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -275,19 +275,22 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { byte obsBase = elt.getBase(); byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + if ( qual == 0 ) + return 0; if ( elt.isReducedRead() ) { // reduced read representation if ( BaseUtils.isRegularBase( obsBase )) { - add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods - return elt.getRepresentativeCount(); // we added nObs bases here + int representativeCount = elt.getRepresentativeCount(); + add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods + return representativeCount; // we added nObs bases here } // odd bases or deletions => don't use them return 0; } - return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; + return add(obsBase, qual, (byte)0, (byte)0, 1); } public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { @@ -519,7 +522,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( qual > SAMUtils.MAX_PHRED_SCORE ) throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); if ( capBaseQualsAtMappingQual ) - qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); + qual = (byte)Math.min((int)qual, p.getMappingQual()); if ( (int)qual < minBaseQual ) qual = (byte)0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index b30a25414..ace780dd0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -72,25 +73,28 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { this.logger = logger; } - /** - * Must be overridden by concrete subclasses - * - * @param tracker rod data - * @param ref reference context - * @param contexts stratified alignment contexts - * @param contextType stratified context type - * @param priors priors to use for GLs - * @param alternateAlleleToUse the alternate allele to use, null if not set - * @param useBAQedPileup should we use the BAQed pileup or the raw one? - * @return variant context where genotypes are no-called but with GLs - */ - public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Allele alternateAlleleToUse, - boolean useBAQedPileup); + /** + * Can be overridden by concrete subclasses + * + * @param tracker rod data + * @param ref reference context + * @param contexts stratified alignment contexts + * @param contextType stratified context type + * @param priors priors to use for GLs + * @param alternateAlleleToUse the alternate allele to use, null if not set + * @param useBAQedPileup should we use the BAQed pileup or the raw one? + * @param locParser Genome Loc Parser + * @return variant context where genotypes are no-called but with GLs + */ + public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup, + GenomeLocParser locParser); + protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index fe2086d47..0756caf03 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -54,17 +55,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - + private final boolean doMultiAllelicCalls; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; - + private final int maxAlternateAlleles; private PairHMMIndelErrorModel pairModel; private static ThreadLocal>> indelLikelihoodMap = new ThreadLocal>>() { - protected synchronized HashMap> initialValue() { - return new HashMap>(); - } - }; + protected synchronized HashMap> initialValue() { + return new HashMap>(); + } + }; private LinkedHashMap haplotypeMap; @@ -81,12 +82,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; + maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES; + doMultiAllelicCalls = UAC.MULTI_ALLELIC; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; @@ -95,7 +98,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private ArrayList computeConsensusAlleles(ReferenceContext ref, Map contexts, - AlignmentContextUtils.ReadOrientation contextType) { + AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { Allele refAllele=null, altAllele=null; GenomeLoc loc = ref.getLocus(); ArrayList aList = new ArrayList(); @@ -114,7 +117,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) return aList; - + for ( Map.Entry sample : contexts.entrySet() ) { // todo -- warning, can be duplicating expensive partition here AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); @@ -126,9 +129,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) - continue; + continue; if(ReadUtils.is454Read(read)) { continue; } @@ -208,63 +211,69 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } -/* if (DEBUG) { - int icount = indelPileup.getNumberOfInsertions(); - int dcount = indelPileup.getNumberOfDeletions(); - if (icount + dcount > 0) - { - List> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases()); - System.out.format("#ins: %d, #del:%d\n", insCount, delCount); - - for (int i=0 ; i < eventStrings.size() ; i++ ) { - System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second); - // int k=0; - } - System.out.println(); - } - } */ } + Collection vcs = new ArrayList(); int maxAlleleCnt = 0; String bestAltAllele = ""; + for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s); - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - bestAltAllele = s; + int curCnt = consensusIndelStrings.get(s), stop = 0; + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases,true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + } + else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } } -// if (DEBUG) -// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); - } //gdebug- - if (maxAlleleCnt < minIndelCountForGenotyping) - return aList; - if (bestAltAllele.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(bestAltAllele.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); + ArrayList vcAlleles = new ArrayList(); + vcAlleles.add(refAllele); + vcAlleles.add(altAllele); - //System.out.println(new String(ref.getBases())); - byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(vcAlleles); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) + vcs.add(builder.make()); + else { + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases,true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); } } - else { - // insertion case - if (Allele.acceptableAlleleBases(bestAltAllele)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(bestAltAllele, false); - } - } - if (refAllele != null && altAllele != null) { - aList.add(0,refAllele); - aList.add(1,altAllele); - } + + if (vcs.isEmpty()) + return aList; // nothing else to do, no alleles passed minimum count criterion + + VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + + aList = new ArrayList(mergedVC.getAlleles()); + return aList; } @@ -277,7 +286,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, Allele alternateAlleleToUse, - boolean useBAQedPileup) { + boolean useBAQedPileup, GenomeLocParser locParser) { if ( tracker == null ) return null; @@ -294,17 +303,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood haplotypeMap.clear(); if (getAlleleListFromVCF) { - for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { - if( vc_input != null && - allowableTypes.contains(vc_input.getType()) && - ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; - } - } - // ignore places where we don't have a variant - if ( vc == null ) - return null; + for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { + if( vc_input != null && + allowableTypes.contains(vc_input.getType()) && + ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if ( vc == null ) + return null; alleleList.clear(); if (ignoreSNPAllelesWhenGenotypingIndels) { @@ -323,7 +332,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } else { - alleleList = computeConsensusAlleles(ref,contexts, contextType); + alleleList = computeConsensusAlleles(ref,contexts, contextType, locParser); if (alleleList.isEmpty()) return null; } @@ -340,7 +349,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (alleleList.isEmpty()) return null; - + refAllele = alleleList.get(0); altAllele = alleleList.get(1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 57cc5594a..81c766e4d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -46,8 +43,6 @@ import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50; - private boolean ALLOW_MULTIPLE_ALLELES; private final boolean useAlleleFromVCF; @@ -56,15 +51,20 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC super(UAC, logger); ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC; useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + + // make sure the PL cache has been initialized with enough alleles + if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles + UnifiedGenotyperEngine.calculatePLcache(3); } - public VariantContext getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenotypePriors priors, + final Allele alternateAlleleToUse, + final boolean useBAQedPileup, + final GenomeLocParser locParser) { if ( !(priors instanceof DiploidSNPGenotypePriors) ) throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); @@ -79,6 +79,20 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC alleles.add(Allele.create(refBase, true)); final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); + // calculate the GLs + ArrayList GLs = new ArrayList(contexts.size()); + for ( Map.Entry sample : contexts.entrySet() ) { + ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( useBAQedPileup ) + pileup = createBAQedPileup( pileup ); + + // create the GenotypeLikelihoods object + final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); + final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); + if ( nGoodBases > 0 ) + GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); + } + // find the alternate allele(s) that we should be using if ( alternateAlleleToUse != null ) { basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true; @@ -93,7 +107,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true; } else { - determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup); + determineAlternateAlleles(basesToUse, refBase, GLs); // how many alternate alleles are we using? int alleleCounter = Utils.countSetBits(basesToUse); @@ -125,22 +139,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC builder.alleles(alleles); // create the genotypes; no-call everyone for now - GenotypesContext genotypes = GenotypesContext.create(); + final GenotypesContext genotypes = GenotypesContext.create(); final List noCall = new ArrayList(); noCall.add(Allele.NO_CALL); - for ( Map.Entry sample : contexts.entrySet() ) { - ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); - if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); - - // create the GenotypeLikelihoods object - final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); - final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); - if ( nGoodBases == 0 ) - continue; - - final double[] allLikelihoods = GL.getLikelihoods(); + for ( SampleGenotypeData sampleData : GLs ) { + final double[] allLikelihoods = sampleData.GL.getLikelihoods(); final double[] myLikelihoods = new double[numLikelihoods]; int myLikelihoodsIndex = 0; @@ -151,60 +155,48 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC } // normalize in log space so that max element is zero. - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); + final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); - HashMap attributes = new HashMap(); - attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + final HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth); attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); - genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); + genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); } return builder.genotypes(genotypes).make(); } // fills in the allelesToUse array - protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map contexts, boolean useBAQedPileup) { - int[] qualCounts = new int[4]; + protected void determineAlternateAlleles(final boolean[] allelesToUse, final byte ref, final List sampleDataList) { - for ( Map.Entry sample : contexts.entrySet() ) { - // calculate the sum of quality scores for each base - ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup(); - for ( PileupElement p : pileup ) { - // ignore deletions - if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) ) - continue; + final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); + final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); + final double[] likelihoodCounts = new double[4]; - final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase()); - if ( index >= 0 ) { - qualCounts[index] += p.getQual(); - } + // based on the GLs, find the alternate alleles with the most probability + for ( SampleGenotypeData sampleData : sampleDataList ) { + final double[] likelihoods = sampleData.GL.getLikelihoods(); + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PLindexOfRef ) { + int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL]; + if ( alleles[0] != baseIndexOfRef ) + likelihoodCounts[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + // don't double-count it + if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] ) + likelihoodCounts[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; } } if ( ALLOW_MULTIPLE_ALLELES ) { - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) { - allelesToUse[index] = true; + for ( int i = 0; i < 4; i++ ) { + if ( likelihoodCounts[i] > 0.0 ) { + allelesToUse[i] = true; } } } else { - // set the non-ref base which has the maximum quality score sum - int maxCount = 0; - int indexOfMax = 0; - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] > maxCount ) { - maxCount = qualCounts[index]; - indexOfMax = index; - } - } - - if ( maxCount > 0 ) + // set the non-ref base which has the maximum sum of non-ref GLs + final int indexOfMax = MathUtils.maxElementIndex(likelihoodCounts); + if ( likelihoodCounts[indexOfMax] > 0.0 ) allelesToUse[indexOfMax] = true; } } @@ -227,4 +219,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } -} \ No newline at end of file + private static class SampleGenotypeData { + + public final String name; + public final DiploidSNPGenotypeLikelihoods GL; + public final int depth; + + public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) { + this.name = name; + this.GL = GL; + this.depth = depth; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 5713432b4..16159393f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -109,7 +109,7 @@ public class UnifiedArgumentCollection { * For advanced users only. */ @Advanced - @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false) + @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false) public boolean MULTI_ALLELIC = false; /** @@ -146,8 +146,8 @@ public class UnifiedArgumentCollection { public int INDEL_HAPLOTYPE_SIZE = 80; @Hidden - @Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false) - public boolean BANDED_INDEL_COMPUTATION = false; + @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) + public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) @@ -184,7 +184,7 @@ public class UnifiedArgumentCollection { // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION; + uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; uac.MULTI_ALLELIC = MULTI_ALLELIC; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index aa33d39e3..5d73e8d28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -243,7 +243,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); + return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -858,4 +858,171 @@ public class UnifiedGenotyperEngine { return calls; } + + /** + * @param vc variant context with genotype likelihoods + * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use + * @param exactAC integer array describing the AC from the exact model for the corresponding alleles + * @return genotypes + */ + public static GenotypesContext constrainedAssignGenotypes(VariantContext vc, boolean[] allelesToUse, int[] exactAC ) { + + final GenotypesContext GLs = vc.getGenotypes(); + + // samples + final List sampleIndices = GLs.getSampleNamesOrderedByName(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = allelesToUse.length; + final List newAlleles = new ArrayList(numOriginalAltAlleles+1); + newAlleles.add(vc.getReference()); + final HashMap alleleIndexMap = new HashMap(); // need this for skipping dimensions + int[] alleleCount = new int[exactAC.length]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToUse[i] ) { + newAlleles.add(vc.getAlternateAllele(i)); + alleleIndexMap.put(vc.getAlternateAllele(i),i); + alleleCount[i] = exactAC[i]; + } else { + alleleCount[i] = 0; + } + } + final List newAltAlleles = newAlleles.subList(1,newAlleles.size()); + final int numNewAltAlleles = newAltAlleles.size(); + ArrayList likelihoodIndexesToUse = null; + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + final int[][] PLcache; + if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { + likelihoodIndexesToUse = new ArrayList(30); + PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; + + for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) { + int[] alleles = PLcache[PLindex]; + // consider this entry only if both of the alleles are good + if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) ) + likelihoodIndexesToUse.add(PLindex); + } + } else { + PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; + } + + // set up the trellis dimensions + // SAMPLE x alt 1 x alt 2 x alt 3 + // todo -- check that exactAC has alt counts at [1],[2],[3] (and not [0],[1],[2]) + double[][][][] transitionTrellis = new double[sampleIndices.size()+1][exactAC[1]][exactAC[2]][exactAC[3]]; + // N x AC1 x AC2 x AC3; worst performance in multi-allelic where all alleles are moderate frequency + // capped at the MLE ACs* + // todo -- there's an optimization: not all states in the rectangular matrix will be reached, in fact + // todo -- for tT[0] we only care about tT[0][0][0][0], and for tT[1], only combinations of 0,1,2. + int idx = 1; // index of which sample we're on + int prevMaxState = 0; // the maximum state (e.g. AC) reached by the previous sample. Symmetric. (AC capping handled by logic in loop) + // iterate over each sample + for ( String sample : sampleIndices ) { + // push the likelihoods into the next possible states, that is to say + // L[state] = L[prev state] + L[genotype getting into state] + // iterate over each previous state, by dimension + // and contribute the likelihoods for transitions to this state + double[][][] prevState = transitionTrellis[idx-1]; + double[][][] thisState = transitionTrellis[idx]; + Genotype genotype = GLs.get(sample); + if ( genotype.isNoCall() || genotype.isFiltered() ) { + thisState = prevState.clone(); + } else { + double[] likelihoods = genotype.getLikelihoods().getAsVector(); + int dim1min = Math.max(0, alleleCount[0]-2*(sampleIndices.size()-idx+1)); + int dim1max = Math.min(prevMaxState,alleleCount[0]); + int dim2min = Math.max(0,alleleCount[1]-2*(sampleIndices.size()-idx+1)); + int dim2max = Math.min(prevMaxState,alleleCount[1]); + int dim3min = Math.max(0,alleleCount[2]-2*(sampleIndices.size()-idx+1)); + int dim3max = Math.min(prevMaxState,alleleCount[2]); + // cue annoying nested for loop + for ( int a1 = dim1min ; a1 <= dim1max; a1++ ) { + for ( int a2 = dim2min; a2 <= dim2max; a2++ ) { + for ( int a3 = dim3min; a3 <= dim3max; a3++ ) { + double base = prevState[a1][a2][a3]; + for ( int likIdx : likelihoodIndexesToUse ) { + int[] offsets = calculateOffsets(PLcache[likIdx]); + thisState[a1+offsets[1]][a2+offsets[2]][a3+offsets[3]] = base + likelihoods[likIdx]; + } + } + } + } + prevMaxState += 2; + } + idx++; + } + + // after all that pain, we have a fully calculated trellis. Now just march backwards from the EAC state and + // assign genotypes along the greedy path + + GenotypesContext calls = GenotypesContext.create(sampleIndices.size()); + int[] state = alleleCount; + for ( String sample : Utils.reverse(sampleIndices) ) { + --idx; + // the next state will be the maximum achievable state + Genotype g = GLs.get(sample); + if ( g.isNoCall() || ! g.hasLikelihoods() ) { + calls.add(g); + continue; + } + + // subset to the new likelihoods. These are not used except for subsetting in the context iself. + // i.e. they are not a part of the calculation. + final double[] originalLikelihoods = GLs.get(sample).getLikelihoods().getAsVector(); + double[] newLikelihoods; + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // todo -- alter this. For ease of programming, likelihood indeces are + // todo -- used to iterate over achievable states. + double max = Double.NEGATIVE_INFINITY; + int[] bestState = null; + int[] bestAlleles = null; + int bestLikIdx = -1; + for ( int likIdx : likelihoodIndexesToUse ) { + int[] offsets = calculateOffsets(PLcache[likIdx]); + double val = transitionTrellis[idx-1][state[0]-offsets[0]][state[1]-offsets[1]][state[2]-offsets[2]]; + if ( val > max ) { + max = val; + bestState = new int[] { state[0]-offsets[0],state[1]-offsets[1],state[2]-offsets[2]}; + bestAlleles = PLcache[likIdx]; + bestLikIdx = likIdx; + } + } + state = bestState; + List gtAlleles = new ArrayList(2); + gtAlleles.add(newAlleles.get(bestAlleles[0])); + gtAlleles.add(newAlleles.get(bestAlleles[1])); + + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(bestLikIdx, newLikelihoods); + Map attrs = new HashMap(g.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); + calls.add(new Genotype(sample, gtAlleles, qual, null, attrs, false)); + + } + return calls; + } + + private static int[] calculateOffsets(int[] alleleIndeces) { + int[] offsets = new int[4]; + for ( int i = 0; i < alleleIndeces.length; i++ ) { + offsets[alleleIndeces[i]]++; + } + + return offsets; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index d893e620e..6410d619d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -28,13 +28,13 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -409,9 +409,9 @@ public class PairHMMIndelErrorModel { } } else { - //System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); - SAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) + // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read.isEmpty()) continue; if(ReadUtils.is454Read(read)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java index ae11d8102..cd4c57136 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java @@ -106,37 +106,70 @@ public class ValidationSiteSelectorWalker extends RodWalker { POLY_BASED_ON_GL } + /** + * The input VCF file + */ @Input(fullName="variant", shortName = "V", doc="Input VCF file, can be specified multiple times", required=true) public List> variants; + /** + * The output VCF file + */ @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; + /** + * Sample name(s) to subset the input VCF to, prior to selecting variants. -sn A -sn B subsets to samples A and B. + */ @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) public Set sampleNames = new HashSet(0); + /** + * Sample regexps to subset the input VCF to, prior to selecting variants. -sn NA12* subsets to all samples with prefix NA12 + */ @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) public Set sampleExpressions ; + /** + * File containing a list of sample names to subset the input vcf to. Equivalent to specifying the contents of the file separately with -sn + */ @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) public Set sampleFiles; + /** + * A mode for selecting sites based on sample-level data. See the wiki documentation for more information. + */ @Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false) private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE; + /** + * An P[nonref] threshold for SAMPLE_SELECTION_MODE=POLY_BASED_ON_GL. See the wiki documentation for more information. + */ @Argument(shortName="samplePNonref",fullName="samplePNonref", doc="GL-based selection mode only: the probability" + " that a site is non-reference in the samples for which to include the site",required=false) private double samplePNonref = 0.99; + /** + * The number of sites in your validation set + */ @Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true) private int numValidationSites; + /** + * Do not exclude filtered sites (e.g. not PASS or .) from consideration for validation + */ @Argument(fullName="includeFilteredSites", shortName="ifs", doc="If true, will include filtered sites in set to choose variants from", required=false) private boolean INCLUDE_FILTERED_SITES = false; + /** + * Argument for the frequency selection mode. (AC/AF/AN) are taken from VCF info field, not recalculated. Typically specified for sites-only VCFs that still have AC/AF/AN information. + */ @Argument(fullName="ignoreGenotypes", shortName="ignoreGenotypes", doc="If true, will ignore genotypes in VCF, will take AC,AF from annotations and will make no sample selection", required=false) private boolean IGNORE_GENOTYPES = false; + /** + * Argument for the frequency selection mode. Allows reference (non-polymorphic) sites to be included in the validation set. + */ @Argument(fullName="ignorePolymorphicStatus", shortName="ignorePolymorphicStatus", doc="If true, will ignore polymorphic status in VCF, and will take VCF record directly without pre-selection", required=false) private boolean IGNORE_POLYMORPHIC = false; @@ -145,19 +178,14 @@ public class ValidationSiteSelectorWalker extends RodWalker { private int numFrequencyBins = 20; /** - * This argument selects allele frequency selection mode: - * KEEP_AF_SPECTRUM will choose variants so that the resulting allele frequency spectrum matches as closely as possible the input set - * UNIFORM will choose variants uniformly without regard to their allele frequency. - * - */ + * This argument selects allele frequency selection mode. See the wiki for more information. + */ @Argument(fullName="frequencySelectionMode", shortName="freqMode", doc="Allele Frequency selection mode", required=false) private AF_COMPUTATION_MODE freqMode = AF_COMPUTATION_MODE.KEEP_AF_SPECTRUM; /** - * This argument selects particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. - * When specified one or more times, a particular type of variant is selected. - * - */ + * This argument selects particular kinds of variants (i.e. SNP, INDEL) out of a list. If left unspecified, all types are considered. + */ @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) private List TYPES_TO_INCLUDE = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java index a271d3c35..3c7c6f00c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -46,6 +46,7 @@ import java.util.*; public class VariantSummary extends VariantEvaluator implements StandardEval { final protected static Logger logger = Logger.getLogger(VariantSummary.class); + /** Indels with size greater than this value are tallied in the CNV column */ private final static int MAX_INDEL_LENGTH = 50; private final static double MIN_CNV_OVERLAP = 0.5; private VariantEvalWalker walker; @@ -196,14 +197,16 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { } private final boolean overlapsKnownCNV(VariantContext cnv) { - final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true); - IntervalTree intervalTree = knownCNVs.get(loc.getContig()); + if ( knownCNVs != null ) { + final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true); + IntervalTree intervalTree = knownCNVs.get(loc.getContig()); - final Iterator> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); - while ( nodeIt.hasNext() ) { - final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue()); - if ( overlapP > MIN_CNV_OVERLAP ) - return true; + final Iterator> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); + while ( nodeIt.hasNext() ) { + final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue()); + if ( overlapP > MIN_CNV_OVERLAP ) + return true; + } } return false; @@ -224,7 +227,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { allVariantCounts.inc(type, ALL); // type specific calculations - if ( type == Type.SNP ) { + if ( type == Type.SNP && eval.isBiallelic() ) { titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample; titvTable.inc(type, ALL); } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 345161416..6941b888b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -145,7 +145,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } return new GenomeLoc(getContig(), this.contigIndex, - Math.min(getStart(), that.getStart()), + Math.min( getStart(), that.getStart() ), Math.max( getStop(), that.getStop()) ); } @@ -465,4 +465,8 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome private final static double overlapPercent(final GenomeLoc gl1, final GenomeLoc gl2) { return (1.0 * gl1.intersect(gl2).size()) / gl1.size(); } + + public long sizeOfOverlap( final GenomeLoc that ) { + return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) : 0L ); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 759e1649d..737f4bb5f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -49,8 +50,11 @@ public class MathUtils { */ - /** Private constructor. No instantiating this class! */ - private MathUtils() {} + /** + * Private constructor. No instantiating this class! + */ + private MathUtils() { + } @Requires({"d > 0.0"}) public static int fastPositiveRound(double d) { @@ -58,21 +62,21 @@ public class MathUtils { } public static int fastRound(double d) { - if ( d > 0.0 ) { + if (d > 0.0) { return fastPositiveRound(d); } else { - return -1*fastPositiveRound(-1*d); + return -1 * fastPositiveRound(-1 * d); } } public static double sum(Collection numbers) { - return sum(numbers,false); + return sum(numbers, false); } - public static double sum( Collection numbers, boolean ignoreNan ) { + public static double sum(Collection numbers, boolean ignoreNan) { double sum = 0; - for ( Number n : numbers ) { - if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) { + for (Number n : numbers) { + if (!ignoreNan || !Double.isNaN(n.doubleValue())) { sum += n.doubleValue(); } } @@ -82,66 +86,72 @@ public class MathUtils { public static int nonNanSize(Collection numbers) { int size = 0; - for ( Number n : numbers) { + for (Number n : numbers) { size += Double.isNaN(n.doubleValue()) ? 0 : 1; } return size; } - public static double average( Collection numbers, boolean ignoreNan) { - if ( ignoreNan ) { - return sum(numbers,true)/nonNanSize(numbers); + public static double average(Collection numbers, boolean ignoreNan) { + if (ignoreNan) { + return sum(numbers, true) / nonNanSize(numbers); } else { - return sum(numbers,false)/nonNanSize(numbers); + return sum(numbers, false) / nonNanSize(numbers); } } - public static double variance( Collection numbers, Number mean, boolean ignoreNan ) { + public static double variance(Collection numbers, Number mean, boolean ignoreNan) { double mn = mean.doubleValue(); double var = 0; - for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; } - if ( ignoreNan ) { return var/(nonNanSize(numbers)-1); } - return var/(numbers.size()-1); + for (Number n : numbers) { + var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? (n.doubleValue() - mn) * (n.doubleValue() - mn) : 0; + } + if (ignoreNan) { + return var / (nonNanSize(numbers) - 1); + } + return var / (numbers.size() - 1); } public static double variance(Collection numbers, Number mean) { - return variance(numbers,mean,false); + return variance(numbers, mean, false); } public static double variance(Collection numbers, boolean ignoreNan) { - return variance(numbers,average(numbers,ignoreNan),ignoreNan); + return variance(numbers, average(numbers, ignoreNan), ignoreNan); } public static double variance(Collection numbers) { - return variance(numbers,average(numbers,false),false); + return variance(numbers, average(numbers, false), false); } public static double sum(double[] values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) s += v; return s; } /** * Calculates the log10 cumulative sum of an array with log10 probabilities + * * @param log10p the array with log10 probabilites - * @param upTo index in the array to calculate the cumsum up to + * @param upTo index in the array to calculate the cumsum up to * @return the log10 of the cumulative sum */ - public static double log10CumulativeSumLog10(double [] log10p, int upTo) { + public static double log10CumulativeSumLog10(double[] log10p, int upTo) { return log10sumLog10(log10p, 0, upTo); } /** * Converts a real space array of probabilities into a log10 array + * * @param prRealSpace * @return */ public static double[] toLog10(double[] prRealSpace) { double[] log10s = new double[prRealSpace.length]; - for ( int i = 0; i < prRealSpace.length; i++ ) + for (int i = 0; i < prRealSpace.length; i++) log10s[i] = Math.log10(prRealSpace[i]); return log10s; } @@ -154,7 +164,7 @@ public class MathUtils { double sum = 0.0; double maxValue = Utils.findMaxEntry(log10p); - for ( int i = start; i < finish; i++ ) { + for (int i = start; i < finish; i++) { sum += Math.pow(10.0, log10p[i] - maxValue); } @@ -163,13 +173,13 @@ public class MathUtils { public static double sumDoubles(List values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) s += v; return s; } public static int sumIntegers(List values) { int s = 0; - for ( int v : values) s += v; + for (int v : values) s += v; return s; } @@ -185,11 +195,11 @@ public class MathUtils { } public static boolean wellFormedDouble(double val) { - return ! Double.isInfinite(val) && ! Double.isNaN(val); + return !Double.isInfinite(val) && !Double.isNaN(val); } public static double bound(double value, double minBoundary, double maxBoundary) { - return Math.max(Math.min(value, maxBoundary), minBoundary); + return Math.max(Math.min(value, maxBoundary), minBoundary); } public static boolean isBounded(double val, double lower, double upper) { @@ -197,7 +207,7 @@ public class MathUtils { } public static boolean isPositive(double val) { - return ! isNegativeOrZero(val); + return !isNegativeOrZero(val); } public static boolean isPositiveOrZero(double val) { @@ -209,17 +219,19 @@ public class MathUtils { } public static boolean isNegative(double val) { - return ! isPositiveOrZero(val); + return !isPositiveOrZero(val); } /** * Compares double values for equality (within 1e-6), or inequality. * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first double value + * @param b the second double value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b) { return compareDoubles(a, b, 1e-6); } + public static byte compareDoubles(double a, double b) { + return compareDoubles(a, b, 1e-6); + } /** * Compares double values for equality (within epsilon), or inequality. @@ -227,23 +239,28 @@ public class MathUtils { * @param a the first double value * @param b the second double value * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b, double epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareDoubles(double a, double b, double epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } /** * Compares float values for equality (within 1e-6), or inequality. * - * @param a the first float value - * @param b the second float value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first float value + * @param b the second float value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b) { return compareFloats(a, b, 1e-6f); } + public static byte compareFloats(float a, float b) { + return compareFloats(a, b, 1e-6f); + } /** * Compares float values for equality (within epsilon), or inequality. @@ -251,47 +268,50 @@ public class MathUtils { * @param a the first float value * @param b the second float value * @param epsilon the precision within which two float values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b, float epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareFloats(float a, float b, float epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } - public static double NormalDistribution(double mean, double sd, double x) - { - double a = 1.0 / (sd*Math.sqrt(2.0 * Math.PI)); - double b = Math.exp(-1.0 * (Math.pow(x - mean,2.0)/(2.0 * sd * sd))); + public static double NormalDistribution(double mean, double sd, double x) { + double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); + double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); return a * b; } - public static double binomialCoefficient (int n, int k) { + public static double binomialCoefficient(int n, int k) { return Math.pow(10, log10BinomialCoefficient(n, k)); } + /** * Computes a binomial probability. This is computed using the formula - * - * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) - * + *

+ * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) + *

* where n is the number of trials, k is the number of successes, and p is the probability of success * - * @param n number of Bernoulli trials - * @param k number of successes - * @param p probability of success - * - * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. + * @param n number of Bernoulli trials + * @param k number of successes + * @param p probability of success + * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. */ - public static double binomialProbability (int n, int k, double p) { + public static double binomialProbability(int n, int k, double p) { return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); } /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. - * @param start - start of the cumulant sum (over hits) - * @param end - end of the cumulant sum (over hits) - * @param total - number of attempts for the number of hits + * + * @param start - start of the cumulant sum (over hits) + * @param end - end of the cumulant sum (over hits) + * @param total - number of attempts for the number of hits * @param probHit - probability of a successful hit * @return - returns the cumulative probability */ @@ -300,11 +320,11 @@ public class MathUtils { double prevProb; BigDecimal probCache = BigDecimal.ZERO; - for(int hits = start; hits < end; hits++) { + for (int hits = start; hits < end; hits++) { prevProb = cumProb; double probability = binomialProbability(total, hits, probHit); cumProb += probability; - if ( probability > 0 && cumProb - prevProb < probability/2 ) { // loss of precision + if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision probCache = probCache.add(new BigDecimal(prevProb)); cumProb = 0.0; hits--; // repeat loop @@ -314,20 +334,20 @@ public class MathUtils { return probCache.add(new BigDecimal(cumProb)).doubleValue(); } - + /** * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] - * + *

+ * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] + *

* where xi represents the number of times outcome i was observed, n is the number of total observations. * In this implementation, the value of n is inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @return the multinomial of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. */ - public static double multinomialCoefficient (int [] k) { + public static double multinomialCoefficient(int[] k) { int n = 0; for (int xi : k) { n += xi; @@ -339,37 +359,38 @@ public class MathUtils { /** * Computes a multinomial probability efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) - * + *

+ * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) + *

* where xi represents the number of times outcome i was observed, n is the number of total observations, and * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is * inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur - * @return the multinomial probability of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur + * @return the multinomial probability of the specified configuration. */ - public static double multinomialProbability (int[] k, double[] p) { + public static double multinomialProbability(int[] k, double[] p) { if (p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); int n = 0; - double [] log10P = new double[p.length]; - for (int i=0; i array[maxI] ) + for (int i = 0; i < array.length; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } @@ -532,11 +554,11 @@ public class MathUtils { } public static int maxElementIndex(int[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int maxI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( maxI == -1 || array[i] > array[maxI] ) + for (int i = 0; i < array.length; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } @@ -551,16 +573,20 @@ public class MathUtils { return array[minElementIndex(array)]; } + public static int arrayMin(int[] array) { + return array[minElementIndex(array)]; + } + public static byte arrayMin(byte[] array) { return array[minElementIndex(array)]; } public static int minElementIndex(double[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } @@ -568,32 +594,44 @@ public class MathUtils { } public static int minElementIndex(byte[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } return minI; - } + } + + public static int minElementIndex(int[] array) { + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + + int minI = -1; + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) + minI = i; + } + + return minI; + } public static int arrayMaxInt(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!"); int m = array.get(0); - for ( int e : array ) m = Math.max(m, e); + for (int e : array) m = Math.max(m, e); return m; } public static double arrayMaxDouble(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!"); double m = array.get(0); - for ( double e : array ) m = Math.max(m, e); + for (double e : array) m = Math.max(m, e); return m; } @@ -636,7 +674,7 @@ public class MathUtils { for (byte v : vals) { sum += v; } - return (byte) Math.floor(sum/vals.length); + return (byte) Math.floor(sum / vals.length); } public static double averageDouble(List vals) { @@ -749,7 +787,9 @@ public class MathUtils { } - /** Draw N random elements from list. */ + /** + * Draw N random elements from list. + */ public static List randomSubset(List list, int N) { if (list.size() <= N) { return list; @@ -770,6 +810,25 @@ public class MathUtils { return ans; } + /** + * Draw N random elements from an array. + * + * @param array your objects + * @param n number of elements to select at random from the list + * @return a new list with the N randomly chosen elements from list + */ + @Requires({"array != null", "n>=0"}) + @Ensures({"result != null", "result.length == Math.min(n, array.length)"}) + public static Object[] randomSubset(final Object[] array, final int n) { + if (array.length <= n) + return array.clone(); + + Object[] shuffledArray = arrayShuffle(array); + Object[] result = new Object[n]; + System.arraycopy(shuffledArray, 0, result, 0, n); + return result; + } + public static double percentage(double x, double base) { return (base > 0 ? (x / base) * 100.0 : 0); } @@ -799,7 +858,7 @@ public class MathUtils { return count; } - public static int countOccurrences(byte element, byte [] array) { + public static int countOccurrences(byte element, byte[] array) { int count = 0; for (byte y : array) { if (element == y) @@ -814,13 +873,13 @@ public class MathUtils { * Better than sorting if N (number of elements to return) is small * * @param array the array - * @param n number of top elements to return + * @param n number of top elements to return * @return the n larger elements of the array */ - public static Collection getNMaxElements(double [] array, int n) { + public static Collection getNMaxElements(double[] array, int n) { ArrayList maxN = new ArrayList(n); double lastMax = Double.MAX_VALUE; - for (int i=0; i sampleIndicesWithReplacement(int n, int k) { - ArrayList chosen_balls = new ArrayList (k); - for (int i=0; i< k; i++) { + ArrayList chosen_balls = new ArrayList(k); + for (int i = 0; i < k; i++) { //Integer chosen_ball = balls[rand.nextInt(k)]; chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); //balls.remove(chosen_ball); @@ -872,11 +931,11 @@ public class MathUtils { /** * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times - - * @param indices the list of indices for elements to extract - * @param list the list from which the elements should be extracted - * @param the template type of the ArrayList - * @return a new ArrayList consisting of the elements at the specified indices + * + * @param indices the list of indices for elements to extract + * @param list the list from which the elements should be extracted + * @param the template type of the ArrayList + * @return a new ArrayList consisting of the elements at the specified indices */ static public ArrayList sliceListByIndices(List indices, List list) { ArrayList subset = new ArrayList(); @@ -898,18 +957,18 @@ public class MathUtils { ArrayList equalToX = new ArrayList(); ArrayList greaterThanX = new ArrayList(); - for(Comparable y : list) { - if(x.compareTo(y) > 0) { + for (Comparable y : list) { + if (x.compareTo(y) > 0) { lessThanX.add(y); - } else if(x.compareTo(y) < 0) { + } else if (x.compareTo(y) < 0) { greaterThanX.add(y); } else equalToX.add(y); } - if(lessThanX.size() > orderStat) + if (lessThanX.size() > orderStat) return orderStatisticSearch(orderStat, lessThanX); - else if(lessThanX.size() + equalToX.size() >= orderStat) + else if (lessThanX.size() + equalToX.size() >= orderStat) return orderStat; else return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); @@ -918,7 +977,7 @@ public class MathUtils { public static Object getMedian(List list) { - return orderStatisticSearch((int) Math.ceil(list.size()/2), list); + return orderStatisticSearch((int) Math.ceil(list.size() / 2), list); } public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { @@ -926,7 +985,7 @@ public class MathUtils { // list index maps to a q-score only through the offset index // returns the kth-largest q-score. - if( reads.size() == 0) { + if (reads.size() == 0) { return 0; } @@ -938,15 +997,15 @@ public class MathUtils { final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; - for(int iter = 0; iter < reads.size(); iter ++) { + for (int iter = 0; iter < reads.size(); iter++) { SAMRecord read = reads.get(iter); int offset = offsets.get(iter); byte quality = read.getBaseQualities()[offset]; - if(quality < qk) { + if (quality < qk) { lessThanQReads.add(read); lessThanQOffsets.add(offset); - } else if(quality > qk) { + } else if (quality > qk) { greaterThanQReads.add(read); greaterThanQOffsets.add(offset); } else { @@ -954,9 +1013,9 @@ public class MathUtils { } } - if(lessThanQReads.size() > k) + if (lessThanQReads.size() > k) return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); - else if(equalToQReads.size() + lessThanQReads.size() >= k) + else if (equalToQReads.size() + lessThanQReads.size() >= k) return qk; else return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); @@ -964,10 +1023,11 @@ public class MathUtils { } public static byte getQScoreMedian(List reads, List offsets) { - return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); + return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } - /** A utility class that computes on the fly average and standard deviation for a stream of numbers. + /** + * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that * it could overflow any naive summation-based scheme or cause loss of precision). * Instead, adding a new number observed @@ -983,20 +1043,31 @@ public class MathUtils { public void add(double obs) { obs_count++; double oldMean = mean; - mean += ( obs - mean ) / obs_count; // update mean - s += ( obs - oldMean ) * ( obs - mean ); + mean += (obs - mean) / obs_count; // update mean + s += (obs - oldMean) * (obs - mean); } public void addAll(Collection col) { - for ( Number o : col ) { + for (Number o : col) { add(o.doubleValue()); } } - public double mean() { return mean; } - public double stddev() { return Math.sqrt(s/(obs_count - 1)); } - public double var() { return s/(obs_count - 1); } - public long observationCount() { return obs_count; } + public double mean() { + return mean; + } + + public double stddev() { + return Math.sqrt(s / (obs_count - 1)); + } + + public double var() { + return s / (obs_count - 1); + } + + public long observationCount() { + return obs_count; + } public RunningAverage clone() { RunningAverage ra = new RunningAverage(); @@ -1007,71 +1078,86 @@ public class MathUtils { } public void merge(RunningAverage other) { - if ( this.obs_count > 0 || other.obs_count > 0 ) { // if we have any observations at all - this.mean = ( this.mean * this.obs_count + other.mean * other.obs_count ) / ( this.obs_count + other.obs_count ); + if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all + this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); this.s += other.s; } this.obs_count += other.obs_count; } } - + // // useful common utility routines // - public static double rate(long n, long d) { return n / (1.0 * Math.max(d, 1)); } - public static double rate(int n, int d) { return n / (1.0 * Math.max(d, 1)); } + public static double rate(long n, long d) { + return n / (1.0 * Math.max(d, 1)); + } - public static long inverseRate(long n, long d) { return n == 0 ? 0 : d / Math.max(n, 1); } - public static long inverseRate(int n, int d) { return n == 0 ? 0 : d / Math.max(n, 1); } + public static double rate(int n, int d) { + return n / (1.0 * Math.max(d, 1)); + } - public static double ratio(int num, int denom) { return ((double)num) / (Math.max(denom, 1)); } - public static double ratio(long num, long denom) { return ((double)num) / (Math.max(denom, 1)); } + public static long inverseRate(long n, long d) { + return n == 0 ? 0 : d / Math.max(n, 1); + } + + public static long inverseRate(int n, int d) { + return n == 0 ? 0 : d / Math.max(n, 1); + } + + public static double ratio(int num, int denom) { + return ((double) num) / (Math.max(denom, 1)); + } + + public static double ratio(long num, long denom) { + return ((double) num) / (Math.max(denom, 1)); + } public static final double[] log10Cache; public static final double[] jacobianLogTable; public static final int JACOBIAN_LOG_TABLE_SIZE = 101; public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; - public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0/JACOBIAN_LOG_TABLE_STEP; + public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; public static final double MAX_JACOBIAN_TOLERANCE = 10.0; private static final int MAXN = 11000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients static { log10Cache = new double[LOG10_CACHE_SIZE]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k=1; k < LOG10_CACHE_SIZE; k++) + for (int k = 1; k < LOG10_CACHE_SIZE; k++) log10Cache[k] = Math.log10(k); - for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) - * JACOBIAN_LOG_TABLE_STEP)); + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) + * JACOBIAN_LOG_TABLE_STEP)); - } + } } static public double softMax(final double[] vec) { double acc = vec[0]; - for (int k=1; k < vec.length; k++) - acc = softMax(acc,vec[k]); + for (int k = 1; k < vec.length; k++) + acc = softMax(acc, vec[k]); return acc; } static public double max(double x0, double x1, double x2) { - double a = Math.max(x0,x1); - return Math.max(a,x2); + double a = Math.max(x0, x1); + return Math.max(a, x2); } - - static public double softMax(final double x0, final double x1, final double x2) { - // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); - // better approximation: do Jacobian logarithm function on data pairs - double a = softMax(x0,x1); - return softMax(a,x2); + static public double softMax(final double x0, final double x1, final double x2) { + // compute naively log10(10^x[0] + 10^x[1]+...) + // return Math.log10(MathUtils.sumLog10(vec)); + + // better approximation: do Jacobian logarithm function on data pairs + double a = softMax(x0, x1); + return softMax(a, x2); } static public double softMax(final double x, final double y) { @@ -1084,49 +1170,50 @@ public class MathUtils { // slow exact version: // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - double diff = x-y; + double diff = x - y; if (diff > MAX_JACOBIAN_TOLERANCE) return x; else if (diff < -MAX_JACOBIAN_TOLERANCE) return y; else if (diff >= 0) { - int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); return x + jacobianLogTable[ind]; - } - else { - int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + } else { + int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); return y + jacobianLogTable[ind]; } } - public static double phredScaleToProbability (byte q) { - return Math.pow(10,(-q)/10.0); + public static double phredScaleToProbability(byte q) { + return Math.pow(10, (-q) / 10.0); } - public static double phredScaleToLog10Probability (byte q) { - return ((-q)/10.0); + public static double phredScaleToLog10Probability(byte q) { + return ((-q) / 10.0); } /** * Returns the phred scaled value of probability p + * * @param p probability (between 0 and 1). * @return phred scaled probability of p */ - public static byte probabilityToPhredScale (double p) { + public static byte probabilityToPhredScale(double p) { return (byte) ((-10) * Math.log10(p)); } - public static double log10ProbabilityToPhredScale (double log10p) { + public static double log10ProbabilityToPhredScale(double log10p) { return (-10) * log10p; } /** * Converts LN to LOG10 + * * @param ln log(x) * @return log10(x) */ - public static double lnToLog10 (double ln) { + public static double lnToLog10(double ln) { return ln * Math.log10(Math.exp(1)); } @@ -1134,169 +1221,190 @@ public class MathUtils { * Constants to simplify the log gamma function calculation. */ private static final double - zero = 0.0, - one = 1.0, - half = .5, - a0 = 7.72156649015328655494e-02, - a1 = 3.22467033424113591611e-01, - a2 = 6.73523010531292681824e-02, - a3 = 2.05808084325167332806e-02, - a4 = 7.38555086081402883957e-03, - a5 = 2.89051383673415629091e-03, - a6 = 1.19270763183362067845e-03, - a7 = 5.10069792153511336608e-04, - a8 = 2.20862790713908385557e-04, - a9 = 1.08011567247583939954e-04, - a10 = 2.52144565451257326939e-05, - a11 = 4.48640949618915160150e-05, - tc = 1.46163214496836224576e+00, - tf = -1.21486290535849611461e-01, - tt = -3.63867699703950536541e-18, - t0 = 4.83836122723810047042e-01, - t1 = -1.47587722994593911752e-01, - t2 = 6.46249402391333854778e-02, - t3 = -3.27885410759859649565e-02, - t4 = 1.79706750811820387126e-02, - t5 = -1.03142241298341437450e-02, - t6 = 6.10053870246291332635e-03, - t7 = -3.68452016781138256760e-03, - t8 = 2.25964780900612472250e-03, - t9 = -1.40346469989232843813e-03, - t10 = 8.81081882437654011382e-04, - t11 = -5.38595305356740546715e-04, - t12 = 3.15632070903625950361e-04, - t13 = -3.12754168375120860518e-04, - t14 = 3.35529192635519073543e-04, - u0 = -7.72156649015328655494e-02, - u1 = 6.32827064025093366517e-01, - u2 = 1.45492250137234768737e+00, - u3 = 9.77717527963372745603e-01, - u4 = 2.28963728064692451092e-01, - u5 = 1.33810918536787660377e-02, - v1 = 2.45597793713041134822e+00, - v2 = 2.12848976379893395361e+00, - v3 = 7.69285150456672783825e-01, - v4 = 1.04222645593369134254e-01, - v5 = 3.21709242282423911810e-03, - s0 = -7.72156649015328655494e-02, - s1 = 2.14982415960608852501e-01, - s2 = 3.25778796408930981787e-01, - s3 = 1.46350472652464452805e-01, - s4 = 2.66422703033638609560e-02, - s5 = 1.84028451407337715652e-03, - s6 = 3.19475326584100867617e-05, - r1 = 1.39200533467621045958e+00, - r2 = 7.21935547567138069525e-01, - r3 = 1.71933865632803078993e-01, - r4 = 1.86459191715652901344e-02, - r5 = 7.77942496381893596434e-04, - r6 = 7.32668430744625636189e-06, - w0 = 4.18938533204672725052e-01, - w1 = 8.33333333333329678849e-02, - w2 = -2.77777777728775536470e-03, - w3 = 7.93650558643019558500e-04, - w4 = -5.95187557450339963135e-04, - w5 = 8.36339918996282139126e-04, - w6 = -1.63092934096575273989e-03; + zero = 0.0, + one = 1.0, + half = .5, + a0 = 7.72156649015328655494e-02, + a1 = 3.22467033424113591611e-01, + a2 = 6.73523010531292681824e-02, + a3 = 2.05808084325167332806e-02, + a4 = 7.38555086081402883957e-03, + a5 = 2.89051383673415629091e-03, + a6 = 1.19270763183362067845e-03, + a7 = 5.10069792153511336608e-04, + a8 = 2.20862790713908385557e-04, + a9 = 1.08011567247583939954e-04, + a10 = 2.52144565451257326939e-05, + a11 = 4.48640949618915160150e-05, + tc = 1.46163214496836224576e+00, + tf = -1.21486290535849611461e-01, + tt = -3.63867699703950536541e-18, + t0 = 4.83836122723810047042e-01, + t1 = -1.47587722994593911752e-01, + t2 = 6.46249402391333854778e-02, + t3 = -3.27885410759859649565e-02, + t4 = 1.79706750811820387126e-02, + t5 = -1.03142241298341437450e-02, + t6 = 6.10053870246291332635e-03, + t7 = -3.68452016781138256760e-03, + t8 = 2.25964780900612472250e-03, + t9 = -1.40346469989232843813e-03, + t10 = 8.81081882437654011382e-04, + t11 = -5.38595305356740546715e-04, + t12 = 3.15632070903625950361e-04, + t13 = -3.12754168375120860518e-04, + t14 = 3.35529192635519073543e-04, + u0 = -7.72156649015328655494e-02, + u1 = 6.32827064025093366517e-01, + u2 = 1.45492250137234768737e+00, + u3 = 9.77717527963372745603e-01, + u4 = 2.28963728064692451092e-01, + u5 = 1.33810918536787660377e-02, + v1 = 2.45597793713041134822e+00, + v2 = 2.12848976379893395361e+00, + v3 = 7.69285150456672783825e-01, + v4 = 1.04222645593369134254e-01, + v5 = 3.21709242282423911810e-03, + s0 = -7.72156649015328655494e-02, + s1 = 2.14982415960608852501e-01, + s2 = 3.25778796408930981787e-01, + s3 = 1.46350472652464452805e-01, + s4 = 2.66422703033638609560e-02, + s5 = 1.84028451407337715652e-03, + s6 = 3.19475326584100867617e-05, + r1 = 1.39200533467621045958e+00, + r2 = 7.21935547567138069525e-01, + r3 = 1.71933865632803078993e-01, + r4 = 1.86459191715652901344e-02, + r5 = 7.77942496381893596434e-04, + r6 = 7.32668430744625636189e-06, + w0 = 4.18938533204672725052e-01, + w1 = 8.33333333333329678849e-02, + w2 = -2.77777777728775536470e-03, + w3 = 7.93650558643019558500e-04, + w4 = -5.95187557450339963135e-04, + w5 = 8.36339918996282139126e-04, + w6 = -1.63092934096575273989e-03; /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long with 32 bit shift + * double to long with 32 bit shift */ - private static final int HI (double x) { - return (int)(Double.doubleToLongBits(x) >> 32); + private static final int HI(double x) { + return (int) (Double.doubleToLongBits(x) >> 32); } /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long without shift + * double to long without shift */ - private static final int LO (double x) { - return (int)Double.doubleToLongBits(x); + private static final int LO(double x) { + return (int) Double.doubleToLongBits(x); } /** * Most efficent implementation of the lnGamma (FDLIBM) * Use via the log10Gamma wrapper method. */ - private static double lnGamma (double x) { - double t,y,z,p,p1,p2,p3,q,r,w; + private static double lnGamma(double x) { + double t, y, z, p, p1, p2, p3, q, r, w; int i; int hx = HI(x); int lx = LO(x); /* purge off +-inf, NaN, +-0, and negative arguments */ - int ix = hx&0x7fffffff; + int ix = hx & 0x7fffffff; if (ix >= 0x7ff00000) return Double.POSITIVE_INFINITY; - if ((ix|lx)==0 || hx < 0) return Double.NaN; - if (ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + if ((ix | lx) == 0 || hx < 0) return Double.NaN; + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ return -Math.log(x); } /* purge off 1 and 2 */ - if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0; - /* for x < 2.0 */ - else if(ix<0x40000000) { - if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ r = -Math.log(x); - if(ix>=0x3FE76944) {y = one-x; i= 0;} - else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;} - else {y = x; i=2;} + if (ix >= 0x3FE76944) { + y = one - x; + i = 0; + } else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } else { + y = x; + i = 2; + } } else { r = zero; - if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */ - else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */ - else {y=x-one;i=2;} + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ else { + y = x - one; + i = 2; + } } - switch(i) { - case 0: - z = y*y; - p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); - p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); - p = y*p1+p2; - r += (p-0.5*y); break; - case 1: - z = y*y; - w = z*y; - p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ - p2 = t1+w*(t4+w*(t7+w*(t10+w*t13))); - p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); - p = z*p1-(tt-w*(p2+y*p3)); - r += (tf + p); break; - case 2: - p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); - p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); - r += (-0.5*y + p1/p2); + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); } - } - else if(ix<0x40200000) { /* x < 8.0 */ - i = (int)x; + } else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int) x; t = zero; - y = x-(double)i; - p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); - q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); - r = half*y+p/q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch(i) { - case 7: z *= (y+6.0); /* FALLTHRU */ - case 6: z *= (y+5.0); /* FALLTHRU */ - case 5: z *= (y+4.0); /* FALLTHRU */ - case 4: z *= (y+3.0); /* FALLTHRU */ - case 3: z *= (y+2.0); /* FALLTHRU */ - r += Math.log(z); break; + y = x - (double) i; + p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += Math.log(z); + break; } /* 8.0 <= x < 2**58 */ } else if (ix < 0x43900000) { t = Math.log(x); - z = one/x; - y = z*z; - w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); - r = (x-half)*(t-one)+w; + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; } else /* 2**58 <= x <= inf */ - r = x*(Math.log(x)-one); + r = x * (Math.log(x) - one); return r; } @@ -1308,7 +1416,7 @@ public class MathUtils { * @param x the x parameter * @return the log10 of the gamma function at x. */ - public static double log10Gamma (double x) { + public static double log10Gamma(double x) { return lnToLog10(lnGamma(x)); } @@ -1320,13 +1428,13 @@ public class MathUtils { * @param k number of successes * @return the log10 of the binomial coefficient */ - public static double log10BinomialCoefficient (int n, int k) { - return log10Gamma(n+1) - log10Gamma(k+1) - log10Gamma(n-k+1); + public static double log10BinomialCoefficient(int n, int k) { + return log10Gamma(n + 1) - log10Gamma(k + 1) - log10Gamma(n - k + 1); } - public static double log10BinomialProbability (int n, int k, double log10p) { - double log10OneMinusP = Math.log10(1-Math.pow(10,log10p)); - return log10BinomialCoefficient(n, k) + log10p*k + log10OneMinusP*(n-k); + public static double log10BinomialProbability(int n, int k, double log10p) { + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); } @@ -1338,38 +1446,74 @@ public class MathUtils { * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) * @return */ - public static double log10MultinomialCoefficient (int n, int [] k) { + public static double log10MultinomialCoefficient(int n, int[] k) { double denominator = 0.0; for (int x : k) { - denominator += log10Gamma(x+1); + denominator += log10Gamma(x + 1); } - return log10Gamma(n+1) - denominator; + return log10Gamma(n + 1) - denominator; } /** * Computes the log10 of the multinomial distribution probability given a vector * of log10 probabilities. Designed to prevent overflows even with very large numbers. * - * @param n number of trials - * @param k array of number of successes for each possibility + * @param n number of trials + * @param k array of number of successes for each possibility * @param log10p array of log10 probabilities * @return */ - public static double log10MultinomialProbability (int n, int [] k, double [] log10p) { + public static double log10MultinomialProbability(int n, int[] k, double[] log10p) { if (log10p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); double log10Prod = 0.0; - for (int i=0; i reads = new ArrayList(); + private byte[] reference = null; + private final GenomeLoc loc; + private GenomeLoc referenceLoc = null; + private final GenomeLocParser genomeLocParser; + public final boolean isActive; + + public ActiveRegion( final GenomeLoc loc, final boolean isActive, final GenomeLocParser genomeLocParser ) { + this.loc = loc; + this.isActive = isActive; + this.genomeLocParser = genomeLocParser; + referenceLoc = loc; + } + + // add each read to the bin and extend the reference genome loc if needed + public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) { + referenceLoc = referenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); + reads.add( new ActiveRead(read, isPrimaryRegion) ); + } + + public ArrayList getReads() { return reads; } + + public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { + // set up the reference if we haven't done so yet + if ( reference == null ) { + reference = referenceReader.getSubsequenceAt(referenceLoc.getContig(), referenceLoc.getStart(), referenceLoc.getStop()).getBases(); + } + + return reference; + } + + public GenomeLoc getLocation() { return loc; } + + public GenomeLoc getReferenceLocation() { return referenceLoc; } + + public int size() { return reads.size(); } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 921a0a599..fb133d902 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -70,27 +70,27 @@ public class ClippingOp { break; case SOFTCLIP_BASES: - if ( read.getReadUnmappedFlag() ) { + if (read.getReadUnmappedFlag()) { // we can't process unmapped reads throw new UserException("Read Clipper cannot soft clip unmapped reads"); } //System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); int myStop = stop; - if ( (stop + 1 - start) == read.getReadLength() ) { + if ((stop + 1 - start) == read.getReadLength()) { // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); //break; myStop--; // just decrement stop } - if ( start > 0 && myStop != read.getReadLength() - 1 ) + if (start > 0 && myStop != read.getReadLength() - 1) throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); Cigar oldCigar = read.getCigar(); int scLeft = 0, scRight = read.getReadLength(); - if ( start == 0 ) + if (start == 0) scLeft = myStop + 1; else scRight = start; @@ -134,8 +134,7 @@ public class ClippingOp { unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); matchesCount = 0; unclippedCigar.add(element); - } - else + } else unclippedCigar.add(element); } if (matchesCount > 0) @@ -284,10 +283,9 @@ public class ClippingOp { } @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) - private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { + private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { if (start == 0 && stop == read.getReadLength() - 1) return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); // If the read is unmapped there is no Cigar string and neither should we create a new cigar string @@ -296,8 +294,8 @@ public class ClippingOp { // the cigar may force a shift left or right (or both) in case we are left with insertions // starting or ending the read after applying the hard clip on start/stop. int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; - byte [] newBases = new byte[newLength]; - byte [] newQuals = new byte[newLength]; + byte[] newBases = new byte[newLength]; + byte[] newQuals = new byte[newLength]; int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); @@ -321,11 +319,11 @@ public class ClippingOp { } @Requires({"!cigar.isEmpty()"}) - private CigarShift hardClipCigar (Cigar cigar, int start, int stop) { + private CigarShift hardClipCigar(Cigar cigar, int start, int stop) { Cigar newCigar = new Cigar(); int index = 0; int totalHardClipCount = stop - start + 1; - int alignmentShift = 0; // caused by hard clipping insertions or deletions + int alignmentShift = 0; // caused by hard clipping deletions // hard clip the beginning of the cigar string if (start == 0) { @@ -353,7 +351,7 @@ public class ClippingOp { // element goes beyond what we need to clip else if (index + shift > stop + 1) { int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop-index+1); + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1); newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); } @@ -388,7 +386,7 @@ public class ClippingOp { if (index + shift < start) newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); - // element goes beyond our clip starting position + // element goes beyond our clip starting position else { int elementLengthAfterChopping = start - index; alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength() - (start - index)); @@ -396,7 +394,7 @@ public class ClippingOp { // if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) totalHardClipCount += elementLengthAfterChopping; - // otherwise, maintain what's left of this last operator + // otherwise, maintain what's left of this last operator else newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); } @@ -408,7 +406,7 @@ public class ClippingOp { } // check if we are hard clipping indels - while(cigarElementIterator.hasNext()) { + while (cigarElementIterator.hasNext()) { cigarElement = cigarElementIterator.next(); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); @@ -444,34 +442,30 @@ public class ClippingOp { boolean readHasStarted = false; boolean addedHardClips = false; - while(!cigarStack.empty()) { + while (!cigarStack.empty()) { CigarElement cigarElement = cigarStack.pop(); - if ( !readHasStarted && - cigarElement.getOperator() != CigarOperator.INSERTION && + if (!readHasStarted && +// cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.DELETION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) readHasStarted = true; - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) totalHardClip += cigarElement.getLength(); - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION) - shift += cigarElement.getLength(); - - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) totalHardClip += cigarElement.getLength(); if (readHasStarted) { - if (i==1) { + if (i == 1) { if (!addedHardClips) { if (totalHardClip > 0) inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); addedHardClips = true; } inverseCigarStack.push(cigarElement); - } - else { + } else { if (!addedHardClips) { if (totalHardClip > 0) cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); @@ -498,7 +492,7 @@ public class ClippingOp { int newShift = 0; int oldShift = 0; - boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift + boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift for (CigarElement cigarElement : newCigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) newShift += cigarElement.getLength(); @@ -509,7 +503,7 @@ public class ClippingOp { } for (CigarElement cigarElement : oldCigar.getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP ) + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) oldShift += cigarElement.getLength(); else if (readHasStarted) break; @@ -522,7 +516,7 @@ public class ClippingOp { if (cigarElement.getOperator() == CigarOperator.INSERTION) return -clippedLength; - // Deletions should be added to the total hard clip count + // Deletions should be added to the total hard clip count else if (cigarElement.getOperator() == CigarOperator.DELETION) return cigarElement.getLength(); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index afe7fa975..7a664bd61 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -374,24 +374,43 @@ public class ReadClipper { * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. * + * Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the + * left of right tail) by specifying either refStart < 0 or refStop < 0. + * * @param refStart first base to clip (inclusive) * @param refStop last base to clip (inclusive) * @return a new read, without the clipped bases */ - @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + @Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { - int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); - int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + if (read.isEmpty()) + return read; - if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) - return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); + int start; + int stop; + + // Determine the read coordinate to start and stop hard clipping + if (refStart < 0) { + if (refStop < 0) + throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")"); + start = 0; + stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + } + else { + if (refStop >= 0) + throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")"); + start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); + stop = read.getReadLength() - 1; + } + +// if ((start == 0 && stop == read.getReadLength() - 1)) +// return GATKSAMRecord.emptyRead(read); if (start < 0 || stop > read.getReadLength() - 1) throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); if ( start > stop ) - throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); + throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop)); if ( start > 0 && stop < read.getReadLength() - 1) throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 18051ce92..586b86490 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -526,6 +526,42 @@ public abstract class AbstractReadBackedPileup rgSet) { + if(pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for(final String sample: tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroups(rgSet); + if(pileup != null) + filteredTracker.addElements(sample,pileup.pileupElementTracker); + } + return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + } + else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for(PE p: pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if(rgSet != null && !rgSet.isEmpty()) { + if(read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } + else { + if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + } + } + @Override public RBP getPileupForLane(String laneID) { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 02767df7c..ccd9d509f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; +import java.util.HashSet; import java.util.List; /** @@ -129,6 +130,13 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForReadGroup(String readGroupId); + /** + * Gets all the reads associated with a given read groups. + * @param rgSet Set of identifiers for the read group. + * @return A pileup containing only the reads in the given read groups. + */ + public ReadBackedPileup getPileupForReadGroups(final HashSet rgSet); + /** * Gets all reads in a given lane id. (Lane ID is the read group * id stripped of the last .XX sample identifier added by the GATK). diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index cedd56bdf..542adea77 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -238,7 +238,7 @@ public class ArtificialSAMUtils { */ public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 96713edc2..913548ecc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.utils.sam; -import com.google.java.contract.Ensures; import net.sf.samtools.*; import org.broadinstitute.sting.utils.NGSPlatform; @@ -184,6 +183,12 @@ public class GATKSAMRecord extends BAMRecord { return getReducedReadCounts() != null; } + /** + * The number of bases corresponding the i'th base of the reduced read. + * + * @param i the read based coordinate inside the read + * @return the number of bases corresponding to the i'th base of the reduced read + */ public final byte getReducedCount(final int i) { byte firstCount = getReducedReadCounts()[0]; byte offsetCount = getReducedReadCounts()[i]; @@ -277,7 +282,6 @@ public class GATKSAMRecord extends BAMRecord { * * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ - @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) public int getSoftStart() { int start = this.getUnclippedStart(); for (CigarElement cigarElement : this.getCigar().getCigarElements()) { @@ -286,17 +290,17 @@ public class GATKSAMRecord extends BAMRecord { else break; } + return start; } /** * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. * - * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. * * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ - @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) public int getSoftEnd() { int stop = this.getUnclippedStart(); @@ -313,6 +317,7 @@ public class GATKSAMRecord extends BAMRecord { else shift = 0; } + return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index f2e54713f..7fa2f6230 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -58,7 +59,7 @@ public class ReadUtils { /** * A HashMap of the SAM spec read flag names - *

+ * * Note: This is not being used right now, but can be useful in the future */ private static final Map readFlagNames = new HashMap(); @@ -79,49 +80,47 @@ public class ReadUtils { /** * This enum represents all the different ways in which a read can overlap an interval. - *

+ * * NO_OVERLAP_CONTIG: * read and interval are in different contigs. - *

+ * * NO_OVERLAP_LEFT: * the read does not overlap the interval. - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * NO_OVERLAP_RIGHT: * the read does not overlap the interval. - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval - *

- * |-----------| (interval) - * <-------------------> (read) - *

+ * + * |-----------| (interval) + * <-------------------> (read) + * * OVERLAP_CONTAINED: * the read starts and ends inside the interval - *

- * |----------------| (interval) - * <--------> (read) + * + * |----------------| (interval) + * <--------> (read) */ - public enum ReadAndIntervalOverlap { - NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED - } + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} /** * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular @@ -141,15 +140,15 @@ public class ReadUtils { /** * is this base inside the adaptor of the read? - *

+ * * There are two cases to treat here: - *

+ * * 1) Read is in the negative strand => Adaptor boundary is on the left tail * 2) Read is in the positive strand => Adaptor boundary is on the right tail - *

+ * * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) * - * @param read the read to test + * @param read the read to test * @param basePos base position in REFERENCE coordinates (not read coordinates) * @return whether or not the base is in the adaptor */ @@ -166,22 +165,22 @@ public class ReadUtils { * the read boundary. If the read is in the positive strand, this is the first base after the end of the * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the * beginning of the fragment. - *

+ * * There are two cases we need to treat here: - *

+ * * 1) Our read is in the reverse strand : - *

- * <----------------------| * - * |---------------------> - *

- * in these cases, the adaptor boundary is at the mate start (minus one) - *

+ * + * <----------------------| * + * |---------------------> + * + * in these cases, the adaptor boundary is at the mate start (minus one) + * * 2) Our read is in the forward strand : - *

- * |----------------------> * - * <----------------------| - *

- * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * |----------------------> * + * <----------------------| + * + * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) * * @param read the read being tested for the adaptor boundary * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig. @@ -264,7 +263,7 @@ public class ReadUtils { /** * If a read starts in INSERTION, returns the first element length. - *

+ * * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. * * @param read @@ -272,7 +271,7 @@ public class ReadUtils { */ public final static int getFirstInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(0); - if (e.getOperator() == CigarOperator.I) + if ( e.getOperator() == CigarOperator.I ) return e.getLength(); else return 0; @@ -280,7 +279,7 @@ public class ReadUtils { /** * If a read ends in INSERTION, returns the last element length. - *

+ * * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. * * @param read @@ -288,7 +287,7 @@ public class ReadUtils { */ public final static int getLastInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); - if (e.getOperator() == CigarOperator.I) + if ( e.getOperator() == CigarOperator.I ) return e.getLength(); else return 0; @@ -297,8 +296,7 @@ public class ReadUtils { /** * Determines what is the position of the read in relation to the interval. * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. - * - * @param read the read + * @param read the read * @param interval the interval * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) */ @@ -309,30 +307,30 @@ public class ReadUtils { int uStart = read.getUnclippedStart(); int uStop = read.getUnclippedEnd(); - if (!read.getReferenceName().equals(interval.getContig())) + if ( !read.getReferenceName().equals(interval.getContig()) ) return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - else if (uStop < interval.getStart()) + else if ( uStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - else if (uStart > interval.getStop()) + else if ( uStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - else if (sStop < interval.getStart()) + else if ( sStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; - else if (sStart > interval.getStop()) + else if ( sStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; - else if ((sStart >= interval.getStart()) && - (sStop <= interval.getStop())) + else if ( (sStart >= interval.getStart()) && + (sStop <= interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - else if ((sStart < interval.getStart()) && - (sStop > interval.getStop())) + else if ( (sStart < interval.getStart()) && + (sStop > interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - else if ((sStart < interval.getStart())) + else if ( (sStart < interval.getStart()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT; else @@ -340,36 +338,52 @@ public class ReadUtils { } /** - * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in - * a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns - * the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the - * deletion. + * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of + * two corner cases: + * + * 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside + * a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it + * doesn't matter because it already returns the previous base by default. + * + * 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the + * read starts with an insertion, and you're requesting the first read based coordinate, it will skip + * the leading insertion (because it has the same reference coordinate as the following base). * * @param read * @param refCoord * @param tail * @return the read coordinate corresponding to the requested reference coordinate for clipping. */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { Pair result = getReadCoordinateForReferenceCoordinate(read, refCoord); int readCoord = result.getFirst(); + // Corner case one: clipping the right tail and falls on deletion, move to the next + // read coordinate. It is not a problem for the left tail because the default answer + // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) readCoord++; + // clipping the left tail and first base is insertion, go to the next read coordinate + // with the same reference coordinate. Advance to the next cigar element, or to the + // end of the read if there is no next element. + Pair firstElementIsInsertion = readStartsWithInsertion(read); + if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst()) + readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1); + return readCoord; } /** * Returns the read coordinate corresponding to the requested reference coordinate. - *

+ * * WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function * will return the last read base before the deletion. This function returns a * Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with * a deletion. - *

+ * * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a * pre-processed result according to normal clipping needs. Or you can use this function and tailor the * behavior to your needs. @@ -421,7 +435,7 @@ public class ReadUtils { if (endsWithinCigar) fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. else { nextCigarElement = cigarElementIterator.next(); @@ -442,13 +456,13 @@ public class ReadUtils { if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) readBases += shift; - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion else if (fallsInsideDeletion && endsWithinCigar) readBases--; } @@ -457,7 +471,6 @@ public class ReadUtils { if (!goalReached) throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); - return new Pair(readBases, fallsInsideDeletion); } @@ -465,12 +478,11 @@ public class ReadUtils { * Compares two SAMRecords only the basis on alignment start. Note that * comparisons are performed ONLY on the basis of alignment start; any * two SAM records with the same alignment start will be considered equal. - *

+ * * Unmapped alignments will all be considered equal. */ @Requires({"read1 != null", "read2 != null"}) - @Ensures("result == 0 || result == 1 || result == -1") public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { AlignmentStartComparator comp = new AlignmentStartComparator(); return comp.compare(read1, read2); @@ -479,7 +491,7 @@ public class ReadUtils { /** * Is a base inside a read? * - * @param read the read to evaluate + * @param read the read to evaluate * @param referenceCoordinate the reference coordinate of the base to test * @return true if it is inside the read, false otherwise. */ @@ -502,4 +514,134 @@ public class ReadUtils { } + /** + * Checks if a read starts with an insertion. It looks beyond Hard and Soft clips + * if there are any. + * + * @param read + * @return A pair with the answer (true/false) and the element or null if it doesn't exist + */ + public static Pair readStartsWithInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.INSERTION) + return new Pair(true, cigarElement); + + else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP) + break; + } + return new Pair(false, null); + } + + /** + * Returns the coverage distribution of a list of reads within the desired region. + * + * See getCoverageDistributionOfRead for information on how the coverage is calculated. + * + * @param list the list of reads covering the region + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { + int [] totalCoverage = new int[stopLocation - startLocation + 1]; + + for (GATKSAMRecord read : list) { + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); + } + + return totalCoverage; + } + + /** + * Returns the coverage distribution of a single read within the desired region. + * + * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample + * reads for variant regions, and deletions count as variants) + * + * @param read the read to get the coverage distribution of + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { + int [] coverage = new int[stopLocation - startLocation + 1]; + int refLocation = read.getSoftStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + switch (cigarElement.getOperator()) { + case S: + case M: + case EQ: + case N: + case X: + case D: + for (int i = 0; i < cigarElement.getLength(); i++) { + if (refLocation >= startLocation && refLocation <= stopLocation) { + int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1; + coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases + } + refLocation++; + } + break; + + case P: + case I: + case H: + break; + } + + if (refLocation > stopLocation) + break; + } + return coverage; + } + + /** + * Makes association maps for the reads and loci coverage as described below : + * + * - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage. + * Note: Locus is in reference coordinates. + * Example: Locus => {read1, read2, ..., readN} + * + * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage. + * Example: Read => {true, true, false, ... false} + * + * @param readList the list of reads to generate the association mappings + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return the two hashmaps described above + */ + public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { + int arraySize = stopLocation - startLocation + 1; + + HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); + HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); + + + for (int i = startLocation; i <= stopLocation; i++) + locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists + + for (GATKSAMRecord read : readList) { + readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays + + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + + for (int i=0; i 0) { + // Update the hash for this locus + HashSet readSet = locusToReadMap.get(refLocation); + readSet.add(read); + + // Add this locus to the read hash + readToLocusMap.get(read)[refLocation - startLocation] = true; + } + else + // Update the boolean array with a 'no coverage' from this read to this locus + readToLocusMap.get(read)[refLocation-startLocation] = false; + } + } + return new Pair>, HashMap>(locusToReadMap, readToLocusMap); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 8b101d1d5..0aec94663 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -145,19 +145,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { } } - @Test(enabled = false) + @Test public void testSnpEffAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + "snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("51258f5c880bd1ca3eb45a1711335c66") + Arrays.asList("ffbda45b3682c9b83cb541d83f6c15d6") ); executeTest("Testing SnpEff annotations", spec); } - @Test(enabled = false) + @Test public void testSnpEffAnnotationsUnsupportedVersion() { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index f7d6af3a7..a91ea1c87 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" ); - e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); + e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -265,7 +265,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("fa4f3ee67d98b64102a8a3ec81a3bc81")); + Arrays.asList("c60a44ba94a80a0cb1fba8b6f90a13cd")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("df90890e43d735573a3b3e4f289ca46b")); + Arrays.asList("36ce53ae4319718ad9c8ae391deebc8c")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("cff6dd0f4eb1ef0b6fc476da6ffead19")); + Arrays.asList("d356cbaf240d7025d1aecdabaff3a3e0")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -294,11 +294,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("1e2a4aab26e9ab0dae709d33a669e036")); + Arrays.asList("947c12ef2a8c29ae787cd11be8c565c8")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } - @Test(enabled = false) + @Test public void testSnpEffAnnotationRequestedWithoutRodBinding() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index 3976231ef..65de6697b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -118,6 +118,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("testTableRecalibrator1", spec); } + else { + throw new IllegalStateException("testTableRecalibrator1: paramsFile was null"); + } } @Test @@ -144,7 +147,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - @Test + @Test(dependsOnMethods = "testCountCovariates1") public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0b7123ae9f4155484b68e4a4f96c5504" ); @@ -170,6 +173,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("testTableRecalibratorMaxQ70", spec); } + else { + throw new IllegalStateException("testTableRecalibratorMaxQ70: paramsFile was null"); + } } } @@ -199,7 +205,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - @Test + @Test(dependsOnMethods = "testCountCovariatesSolidIndelsRemoveRefBias") public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" ); @@ -224,6 +230,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("testTableRecalibratorSolidIndelsRemoveRefBias", spec); } + else { + throw new IllegalStateException("testTableRecalibratorSolidIndelsRemoveRefBias: paramsFile was null"); + } } } @@ -305,7 +314,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - @Test + @Test(dependsOnMethods = "testCountCovariatesNoIndex") public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "991f093a0e610df235d28ada418ebf33" ); @@ -329,6 +338,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("testTableRecalibratorNoIndex", spec); } + else { + throw new IllegalStateException("testTableRecalibratorNoIndex: paramsFile was null"); + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3ef4e5e9f..5c3a43c96 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -14,19 +14,19 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; - @Test(enabled = false) + @Test public void testFunctionClassWithSnpeff() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", + "--eval " + validationDataLocation + "snpEff2.0.5.AFR.unfiltered.VariantAnnotator.output.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-noST", "-ST FunctionalClass", - "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", + "-L " + validationDataLocation + "snpEff2.0.5.AFR.unfiltered.VariantAnnotator.output.vcf", "-o %s" ), 1, @@ -450,4 +450,21 @@ public class VariantEvalIntegrationTest extends WalkerTest { ); executeTest("testIntervalStrat", spec); } + + @Test + public void testModernVCFWithLargeIndels() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf", + "-L 20", + "-D " + b37dbSNP132, + "-o %s" + ), + 1, + Arrays.asList("a6f8b32fa732632da13dfe3ddcc73cef") + ); + executeTest("testModernVCFWithLargeIndels", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 251c105c3..049bdce3e 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -26,22 +26,20 @@ package org.broadinstitute.sting.utils; +import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import org.broadinstitute.sting.BaseTest; - -import java.util.List; -import java.util.ArrayList; -import java.util.Collections; +import java.util.*; /** * Basic unit test for MathUtils */ public class MathUtilsUnitTest extends BaseTest { @BeforeClass - public void init() { } + public void init() { + } /** * Tests that we get the right values from the binomial distribution @@ -66,20 +64,20 @@ public class MathUtilsUnitTest extends BaseTest { public void testMultinomialProbability() { logger.warn("Executing testMultinomialProbability"); - int[] counts0 = { 2, 0, 1 }; - double[] probs0 = { 0.33, 0.33, 0.34 }; + int[] counts0 = {2, 0, 1}; + double[] probs0 = {0.33, 0.33, 0.34}; Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6); - int[] counts1 = { 10, 20, 30 }; - double[] probs1 = { 0.25, 0.25, 0.50 }; + int[] counts1 = {10, 20, 30}; + double[] probs1 = {0.25, 0.25, 0.50}; Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9); - int[] counts2 = { 38, 82, 50, 36 }; - double[] probs2 = { 0.25, 0.25, 0.25, 0.25 }; + int[] counts2 = {38, 82, 50, 36}; + double[] probs2 = {0.25, 0.25, 0.25, 0.25}; Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10); - int[] counts3 = { 1, 600, 1 }; - double[] probs3 = { 0.33, 0.33, 0.34 }; + int[] counts3 = {1, 600, 1}; + double[] probs3 = {0.33, 0.33, 0.34}; Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286); } @@ -123,19 +121,21 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); } - /** Tests that we correctly compute mean and standard deviation from a stream of numbers */ + /** + * Tests that we correctly compute mean and standard deviation from a stream of numbers + */ @Test public void testRunningAverage() { logger.warn("Executing testRunningAverage"); - int [] numbers = {1,2,4,5,3,128,25678,-24}; + int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; MathUtils.RunningAverage r = new MathUtils.RunningAverage(); - for ( int i = 0 ; i < numbers.length ; i++ ) r.add((double)numbers[i]); + for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]); - Assert.assertEquals((long)numbers.length, r.observationCount()); - Assert.assertTrue(r.mean()- 3224.625 < 2e-10 ); - Assert.assertTrue(r.stddev()-9072.6515881128 < 2e-10); + Assert.assertEquals((long) numbers.length, r.observationCount()); + Assert.assertTrue(r.mean() - 3224.625 < 2e-10); + Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10); } @Test @@ -174,4 +174,56 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); } + @Test(enabled = true) + public void testRandomSubset() { + Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0); + Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1); + Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2); + Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3); + Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4); + Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5); + Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6); + Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7); + Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8); + Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9); + Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10); + Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10); + + for (int i = 0; i < 25; i++) + Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5))); + + } + + @Test(enabled = true) + public void testArrayShuffle() { + Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + for (int i = 0; i < 25; i++) { + Object[] t = MathUtils.arrayShuffle(x); + Assert.assertTrue(hasUniqueElements(t)); + Assert.assertTrue(hasAllElements(x, t)); + } + } + + private boolean hasUniqueElements(Object[] x) { + for (int i = 0; i < x.length; i++) + for (int j = i + 1; j < x.length; j++) + if (x[i].equals(x[j]) || x[i] == x[j]) + return false; + return true; + } + + private boolean hasAllElements(final Object[] expected, final Object[] actual) { + HashSet set = new HashSet(); + set.addAll(Arrays.asList(expected)); + set.removeAll(Arrays.asList(actual)); + return set.isEmpty(); + } + + + private void p (Object []x) { + for (Object v: x) + System.out.print((Integer) v + " "); + System.out.println(); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index 18108e0a1..16b141bc3 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -112,8 +112,9 @@ public class ReadClipperTestUtils { } } - if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) - return true; // we don't accept reads starting or ending in deletions (add any other constraint here) +// if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) } return false; @@ -190,4 +191,18 @@ public class ReadClipperTestUtils { return invertedCigar; } + /** + * Checks whether or not the read has any cigar element that is not H or S + * + * @param read + * @return true if it has any M, I or D, false otherwise + */ + public static boolean readHasNonClippedBases(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } + + } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index 4dad68dc5..bc918c0a4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -30,12 +30,12 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.HashMap; import java.util.List; /** @@ -59,10 +59,11 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); int readLength = alnStart - alnEnd; - for (int i=0; i= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + assertUnclippedLimits(read, clippedRead); } } } @@ -72,12 +73,14 @@ public class ReadClipperUnitTest extends BaseTest { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int readLength = read.getReadLength(); - for (int i=0; i %s", i, read.getCigarString(), clipLeft.getCigarString())); + Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); - GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength-1); + GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1); Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); } } } @@ -86,19 +89,27 @@ public class ReadClipperUnitTest extends BaseTest { public void testHardClipByReferenceCoordinates() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - for (int i=alnStart; i<=alnEnd; i++) { - if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side - GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); - if (!clipLeft.isEmpty()) - Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + int start = read.getSoftStart(); + int stop = read.getSoftEnd(); + +// System.out.println(String.format("CIGAR: %s (%d, %d)", cigar.toString(), start, stop)); + +// if (ReadUtils.readIsEntirelyInsertion(read)) +// System.out.println("debug"); + + for (int i = start; i <= stop; i++) { + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i); + if (!clipLeft.isEmpty()) { +// System.out.println(String.format("\t left [%d] %s -> %s ", i-start+1, cigar.toString(), clipLeft.getCigarString())); + Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); } - if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side - GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. - Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. +// System.out.println(String.format("\t right [%d] %s -> %s ", i-start+1, cigar.toString(), clipRight.getCigarString())); + Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); } } } @@ -111,10 +122,14 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side - for (int i=alnStart; i<=alnEnd; i++) { - GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); - if (!clipLeft.isEmpty()) - Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + for (int i = alnStart; i <= alnEnd; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); + + if (!clipLeft.isEmpty()) { +// System.out.println(String.format("Left Tail [%d]: %s (%d,%d,%d : %d,%d,%d) -> %s (%d,%d,%d : %d,%d,%d)", i, cigar.toString(), read.getUnclippedStart(), read.getSoftStart(), read.getAlignmentStart(), read.getAlignmentEnd(), read.getSoftEnd(), read.getUnclippedEnd(), clipLeft.getCigarString(), clipLeft.getUnclippedStart(), clipLeft.getSoftStart(), clipLeft.getAlignmentStart(), clipLeft.getAlignmentEnd(), clipLeft.getSoftEnd(), clipLeft.getUnclippedEnd())); + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); + } } } } @@ -127,10 +142,12 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side - for (int i=alnStart; i<=alnEnd; i++) { + for (int i = alnStart; i <= alnEnd; i++) { GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); + } } } } @@ -145,43 +162,36 @@ public class ReadClipperUnitTest extends BaseTest { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int readLength = read.getReadLength(); - byte [] quals = new byte[readLength]; + byte[] quals = new byte[readLength]; for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { - - // create a read with nLowQualBases in the left tail - Utils.fillArrayWithByte(quals, HIGH_QUAL); + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) quals[addLeft] = LOW_QUAL; read.setBaseQualities(quals); GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests + assertUnclippedLimits(read, clipLeft); // Make sure limits haven't changed + assertNoLowQualBases(clipLeft, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); - // Make sure the low qualities are gone - assertNoLowQualBases(clipLeft, LOW_QUAL); - // Can't run this test with the current contract of no hanging insertions -// Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); - - // create a read with nLowQualBases in the right tail - Utils.fillArrayWithByte(quals, HIGH_QUAL); + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the right tail for (int addRight = 0; addRight < nLowQualBases; addRight++) quals[readLength - addRight - 1] = LOW_QUAL; read.setBaseQualities(quals); GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests +// System.out.println(String.format("Debug [%d]: %s -> %s / %s", nLowQualBases, cigar.toString(), clipLeft.getCigarString(), clipRight.getCigarString())); - // Make sure the low qualities are gone - assertNoLowQualBases(clipRight, LOW_QUAL); + assertUnclippedLimits(read, clipRight); // Make sure limits haven't changed + assertNoLowQualBases(clipRight, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); - // Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions - //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); - - // create a read with nLowQualBases in the both tails - if (nLowQualBases <= readLength/2) { - Utils.fillArrayWithByte(quals, HIGH_QUAL); + if (nLowQualBases <= readLength / 2) { + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases on both tails for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { quals[addBoth] = LOW_QUAL; quals[readLength - addBoth - 1] = LOW_QUAL; @@ -189,83 +199,25 @@ public class ReadClipperUnitTest extends BaseTest { read.setBaseQualities(quals); GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests - - // Make sure the low qualities are gone - assertNoLowQualBases(clipBoth, LOW_QUAL); - - // Can't run this test with the current contract of no hanging insertions - //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); + assertUnclippedLimits(read, clipBoth); // Make sure limits haven't changed + assertNoLowQualBases(clipBoth, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2 * nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); } } -// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString())); } - - // ONE OFF Testing clipping that ends inside an insertion ( Ryan's bug ) - final byte[] BASES = {'A','C','G','T','A','C','G','T'}; - final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; - final String CIGAR = "1S1M5I1S"; - - final byte[] CLIPPED_BASES = {}; - final byte[] CLIPPED_QUALS = {}; - final String CLIPPED_CIGAR = ""; - - - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); - GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR); - - ReadClipperTestUtils.assertEqualReads(ReadClipper.hardClipLowQualEnds(read, (byte) 2), expected); } @Test(enabled = true) public void testHardClipSoftClippedBases() { - - // Generate a list of cigars to test for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read); + CigarCounter original = new CigarCounter(read); + CigarCounter clipped = new CigarCounter(clippedRead); - int sumHardClips = 0; - int sumMatches = 0; - - boolean tail = true; - for (CigarElement element : read.getCigar().getCigarElements()) { - // Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right) - if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP) - tail = true; - - // Adds all H, S and D's (next to hard/soft clips). - // All these should be hard clips after clipping. - if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION)) - sumHardClips += element.getLength(); - - // this means we're no longer on the tail (insertions can still potentially be the tail because - // of the current contract of clipping out hanging insertions - else if (element.getOperator() != CigarOperator.INSERTION) - tail = false; - - // Adds all matches to verify that they remain the same after clipping - if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) - sumMatches += element.getLength(); - } - - for (CigarElement element : clippedRead.getCigar().getCigarElements()) { - // Test if clipped read has Soft Clips (shouldn't have any!) - Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString())); - - // Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for - if (element.getOperator() == CigarOperator.HARD_CLIP) - sumHardClips -= element.getLength(); - - // Make sure all matches are still there - if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) - sumMatches -= element.getLength(); - } - Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips)); - Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); - - -// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS } } @@ -276,38 +228,39 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read); + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); - if (! clippedRead.isEmpty()) { + if (!clippedRead.isEmpty()) { Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone - } - else + } else Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped } } } @Test(enabled = true) - public void testRevertSoftClippedBases() - { - for (Cigar cigar: cigarList) { + public void testRevertSoftClippedBases() { + for (Cigar cigar : cigarList) { final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); - if ( leadingSoftClips > 0 || tailSoftClips > 0) { + assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed + + if (leadingSoftClips > 0 || tailSoftClips > 0) { final int expectedStart = read.getAlignmentStart() - leadingSoftClips; final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); - } - else + } else Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); } } @@ -315,12 +268,25 @@ public class ReadClipperUnitTest extends BaseTest { private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { if (!read.isEmpty()) { - byte [] quals = read.getBaseQualities(); - for (int i=0; i 0; } @@ -335,10 +301,46 @@ public class ReadClipperUnitTest extends BaseTest { return 0; } - private boolean cigarHasElementsDifferentThanInsertionsAndHardClips (Cigar cigar) { + private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(Cigar cigar) { for (CigarElement cigarElement : cigar.getCigarElements()) if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) return true; return false; } + + private class CigarCounter { + private HashMap counter; + + public Integer getCounterForOp(CigarOperator operator) { + return counter.get(operator); + } + + public CigarCounter(GATKSAMRecord read) { + CigarOperator[] operators = CigarOperator.values(); + counter = new HashMap(operators.length); + + for (CigarOperator op : operators) + counter.put(op, 0); + + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength()); + } + + public boolean assertHardClippingSoftClips(CigarCounter clipped) { + for (CigarOperator op : counter.keySet()) { + if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) { + int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP); + int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP); + int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP); + + Assert.assertEquals(counterTotal, clippedHard); + Assert.assertTrue(clippedSoft == 0); + } else + Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op)); + } + return true; + } + + } + } \ No newline at end of file diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar b/settings/repository/net.sf.snpeff/snpeff-2.0.5.jar similarity index 92% rename from settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar rename to settings/repository/net.sf.snpeff/snpeff-2.0.5.jar index ee5d02367..6dc922af7 100644 Binary files a/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar and b/settings/repository/net.sf.snpeff/snpeff-2.0.5.jar differ diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml similarity index 77% rename from settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml rename to settings/repository/net.sf.snpeff/snpeff-2.0.5.xml index 5417641d3..9a622abe5 100644 --- a/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml +++ b/settings/repository/net.sf.snpeff/snpeff-2.0.5.xml @@ -1,3 +1,3 @@ - +