diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java index 17e4a0743..0161c3ab2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java @@ -138,7 +138,7 @@ public class AlignmentContext implements HasGenomeLocation { * @return */ public boolean hasReads() { - return basePileup != null && basePileup.size() > 0 ; + return basePileup != null && basePileup.getNumberOfElements() > 0 ; } /** @@ -146,7 +146,7 @@ public class AlignmentContext implements HasGenomeLocation { * @return */ public int size() { - return basePileup.size(); + return basePileup.getNumberOfElements(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java index c9506ec4c..4e75f3ddb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java @@ -92,7 +92,7 @@ public class AlignmentContextUtils { ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample); // Don't add empty pileups to the split context. - if(pileupBySample.size() == 0) + if(pileupBySample.getNumberOfElements() == 0) continue; if(sample != null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index 8152f74c2..e94d01d5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -17,7 +17,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; */ @By(DataSource.READS) @Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -@PartitionBy(PartitionType.INTERVAL) +@PartitionBy(PartitionType.LOCUS) @ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) public abstract class LocusWalker extends Walker { // Do we actually want to operate on the context? diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java index 361e222c2..f0d92ef8a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java @@ -34,6 +34,12 @@ public enum PartitionType { */ NONE, + /** + * The walker inputs can be chunked down to individual + * reads. + */ + READ, + /** * The walker inputs can be chunked down to the * per-locus level. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index db2038aa3..35f04abfc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -12,7 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; * To change this template use File | Settings | File Templates. */ @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) -@PartitionBy(PartitionType.CONTIG) +@PartitionBy(PartitionType.READ) public abstract class ReadWalker extends Walker { public boolean requiresOrderedReads() { return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index e501258c5..e5f75f06d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -92,7 +92,7 @@ public class AlleleBalance extends InfoFieldAnnotation { continue; } // todo -- actually care about indel length from the pileup (agnostic at the moment) - int refCount = indelPileup.size(); + int refCount = indelPileup.getNumberOfElements(); int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions(); if ( refCount + altCount == 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java deleted file mode 100755 index 353fd1c2c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java +++ /dev/null @@ -1,32 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -import java.util.Map; - -/** - * Abstract base class for all annotations that are normalized by depth - */ -public abstract class AnnotationByDepth extends InfoFieldAnnotation { - - - protected int annotationByVariantDepth(final Map genotypes, Map stratifiedContexts) { - int depth = 0; - for ( Map.Entry genotype : genotypes.entrySet() ) { - - // we care only about variant calls - if ( genotype.getValue().isHomRef() ) - continue; - - AlignmentContext context = stratifiedContexts.get(genotype.getKey()); - if ( context != null ) - depth += context.size(); - } - - return depth; - } - - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 864be55b7..ea28a28e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -41,7 +41,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) - depth += sample.getValue().size(); + depth += sample.getValue().getBasePileup().depthOfCoverage(); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%d", depth)); return map; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 552289309..7fcb56b1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -21,7 +21,7 @@ import java.util.Map; * * Low scores are indicative of false positive calls and artifacts. */ -public class QualByDepth extends AnnotationByDepth implements StandardAnnotation { +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -43,14 +43,13 @@ public class QualByDepth extends AnnotationByDepth implements StandardAnnotation if ( context == null ) continue; - depth += context.size(); + depth += context.getBasePileup().depthOfCoverage(); } if ( depth == 0 ) return null; - int qDepth = annotationByVariantDepth(genotypes, stratifiedContexts); - double QD = 10.0 * vc.getNegLog10PError() / (double)qDepth; + double QD = 10.0 * vc.getNegLog10PError() / (double)depth; Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java index 772541eb6..168fbdc49 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java @@ -79,7 +79,7 @@ public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation { alleleCounts.put(allele.getBases()[0], 0); ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - int totalDepth = pileup.size(); + int totalDepth = pileup.getNumberOfElements(); Map map = new HashMap(); map.put(getKeyNames().get(0), totalDepth); // put total depth in right away @@ -119,7 +119,7 @@ public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation { ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup(); if ( pileup == null ) return null; - int totalDepth = pileup.size(); + int totalDepth = pileup.getNumberOfElements(); Map map = new HashMap(); map.put(getKeyNames().get(0), totalDepth); // put total depth in right away diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java deleted file mode 100755 index 131b87794..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * SB annotation value by depth of alt containing samples - */ -public class SBByDepth extends AnnotationByDepth { - - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY)) - return null; - - double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1); - - final Map genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - int sDepth = annotationByVariantDepth(genotypes, stratifiedContexts); - if ( sDepth == 0 ) - return null; - - - - double SbyD = (-sBias / (double)sDepth); - if (SbyD > 0) - SbyD = Math.log10(SbyD); - else - SbyD = -1000; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", SbyD)); - return map; - } - - public List getKeyNames() { return Arrays.asList("SBD"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Strand Bias by Depth")); } - - - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index f747fbc2e..66d2ad318 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -43,7 +43,7 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn if (pileup != null) { deletions += pileup.getNumberOfDeletions(); - depth += pileup.size(); + depth += pileup.getNumberOfElements(); } } Map map = new HashMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 3168d9a6b..cbbb3d43f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -113,6 +113,7 @@ import java.util.*; // todo -- allow for user to set linear binning (default is logarithmic) // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) +@PartitionBy(PartitionType.INTERVAL) public class DepthOfCoverageWalker extends LocusWalker>, CoveragePartitioner> implements TreeReducible { @Output @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 525be9cb0..666fe88a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -278,10 +278,10 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( elt.isReducedRead() ) { // reduced read representation - byte qual = elt.getReducedQual(); + byte qual = elt.getQual(); if ( BaseUtils.isRegularBase( elt.getBase() )) { - add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods - return elt.getReducedCount(); // we added nObs bases here + add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods + return elt.getRepresentativeCount(); // we added nObs bases here } else // odd bases or deletions => don't use them return 0; } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 9f0585d13..e99b6af60 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -569,7 +569,7 @@ public class UnifiedGenotyperEngine { ReadBackedPileup pileup = rawContext.getBasePileup() .getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); // don't call when there is no coverage - if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) return null; // stratify the AlignmentContext and cut by sample @@ -586,7 +586,7 @@ public class UnifiedGenotyperEngine { ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); // don't call when there is no coverage - if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) return null; // stratify the AlignmentContext and cut by sample @@ -602,7 +602,7 @@ public class UnifiedGenotyperEngine { for( final PileupElement p : rawContext.getBasePileup() ) { if( p.isDeletion() ) { numDeletions++; } } - if( ((double) numDeletions) / ((double) rawContext.getBasePileup().size()) > UAC.MAX_DELETION_FRACTION ) { + if( ((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) { return null; } } @@ -649,9 +649,9 @@ public class UnifiedGenotyperEngine { if (isCovered) { AlignmentContext context = contexts.get(sample); if (context.hasBasePileup()) - depth = context.getBasePileup().size(); + depth = context.getBasePileup().depthOfCoverage(); else if (context.hasExtendedEventPileup()) - depth = context.getExtendedEventPileup().size(); + depth = context.getExtendedEventPileup().depthOfCoverage(); } P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 0df7b7cbd..11ff99ce3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -32,17 +32,11 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.io.File; -import java.io.FileWriter; -import java.io.PrintStream; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; @@ -376,8 +370,8 @@ public class PairHMMIndelErrorModel { HashMap> indelLikelihoodMap){ int numHaplotypes = haplotypeMap.size(); - final double readLikelihoods[][] = new double[pileup.size()][numHaplotypes]; - final int readCounts[] = new int[pileup.size()]; + final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes]; + final int readCounts[] = new int[pileup.getNumberOfElements()]; int readIdx=0; LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); @@ -403,8 +397,7 @@ public class PairHMMIndelErrorModel { for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations - final boolean isReduced = p.isReducedRead(); - readCounts[readIdx] = isReduced ? p.getReducedCount() : 1; + readCounts[readIdx] = p.getRepresentativeCount(); // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) if (indelLikelihoodMap.containsKey(p)) { @@ -607,7 +600,7 @@ public class PairHMMIndelErrorModel { if (DEBUG) { System.out.println("\nLikelihood summary"); - for (readIdx=0; readIdx < pileup.size(); readIdx++) { + for (readIdx=0; readIdx < pileup.getNumberOfElements(); readIdx++) { System.out.format("Read Index: %d ",readIdx); for (int i=0; i < readLikelihoods[readIdx].length; i++) System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index 56ce60449..424e05c20 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -228,7 +228,7 @@ public class RealignerTargetCreator extends RodWalker 0.0 && mismatchThreshold <= 1.0 && - pileup.size() >= minReadsAtLocus && + pileup.getNumberOfElements() >= minReadsAtLocus && (double)mismatchQualities / (double)totalQualities >= mismatchThreshold ) hasPointEvent = true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 2c2677d82..68fbe8ce2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -258,10 +258,10 @@ public class ReadBackedPhasingWalker extends RodWalkerOutput *

- * A table delimited file containing the values of the requested fields in the VCF file + * A tab-delimited file containing the values of the requested fields in the VCF file *

* *

Examples

@@ -106,7 +106,7 @@ public class VariantsToTable extends RodWalker { /** * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered). - * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value. + * Throwing this flag will cause VariantsToTable to emit values regardless of the FILTER field value. */ @Advanced @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b66198713..c1479bc69 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -5,6 +5,10 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -174,6 +178,8 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) }; } + public GenomeLoc union( GenomeLoc that ) { return merge(that); } + @Requires("that != null") @Ensures("result != null") public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException { @@ -192,6 +198,79 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome Math.min( getStop(), that.getStop()) ); } + @Requires("that != null") + public final List subtract( final GenomeLoc that ) { + if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { + if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) + throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); + return Arrays.asList(UNMAPPED); + } + + if (!(this.overlapsP(that))) { + throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap"); + } + + if (equals(that)) { + return Collections.emptyList(); + } else if (containsP(that)) { + List l = new ArrayList(2); + + /** + * we have to create two new region, one for the before part, one for the after + * The old region: + * |----------------- old region (g) -------------| + * |----- to delete (e) ------| + * + * product (two new regions): + * |------| + |--------| + * + */ + int afterStop = this.getStop(), afterStart = that.getStop() + 1; + int beforeStop = that.getStart() - 1, beforeStart = this.getStart(); + if (afterStop - afterStart >= 0) { + GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop); + l.add(after); + } + if (beforeStop - beforeStart >= 0) { + GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop); + l.add(before); + } + + return l; + } else if (that.containsP(this)) { + /** + * e completely contains g, delete g, but keep looking, there may be more regions + * i.e.: + * |--------------------- e --------------------| + * |--- g ---| |---- others ----| + */ + return Collections.emptyList(); // don't need to do anything + } else { + /** + * otherwise e overlaps some part of g + * + * figure out which region occurs first on the genome. I.e., is it: + * |------------- g ----------| + * |------------- e ----------| + * + * or: + * |------------- g ----------| + * |------------ e -----------| + * + */ + + GenomeLoc n; + if (that.getStart() < this.getStart()) { + n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop()); + } else { + n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1); + } + + // replace g with the new region + return Arrays.asList(n); + } + } + @Requires("that != null") public final boolean containsP(GenomeLoc that) { return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop(); @@ -203,19 +282,14 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } @Requires("that != null") - public final int minus( final GenomeLoc that ) { + @Ensures("result >= 0") + public final int distance( final GenomeLoc that ) { if ( this.onSameContig(that) ) - return this.getStart() - that.getStart(); + return Math.abs(this.getStart() - that.getStart()); else return Integer.MAX_VALUE; } - @Requires("that != null") - @Ensures("result >= 0") - public final int distance( final GenomeLoc that ) { - return Math.abs(minus(that)); - } - @Requires({"left != null", "right != null"}) public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) { return this.compareTo(left) > -1 && this.compareTo(right) < 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index fd7a79f48..26be0e59e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -215,7 +215,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( p.overlapsP(e) ) { toProcess.pop(); - for ( GenomeLoc newP : subtractRegion(p, e) ) + for ( GenomeLoc newP : p.subtract(e) ) toProcess.push(newP); } else if ( p.compareContigs(e) < 0 ) { good.add(toProcess.pop()); // p is now good @@ -236,69 +236,6 @@ public class GenomeLocSortedSet extends AbstractSet { return createSetFromList(genomeLocParser,good); } - private static final List EMPTY_LIST = new ArrayList(); - private List subtractRegion(GenomeLoc g, GenomeLoc e) { - if (g.equals(e)) { - return EMPTY_LIST; - } else if (g.containsP(e)) { - List l = new ArrayList(); - - /** - * we have to create two new region, one for the before part, one for the after - * The old region: - * |----------------- old region (g) -------------| - * |----- to delete (e) ------| - * - * product (two new regions): - * |------| + |--------| - * - */ - int afterStop = g.getStop(), afterStart = e.getStop() + 1; - int beforeStop = e.getStart() - 1, beforeStart = g.getStart(); - if (afterStop - afterStart >= 0) { - GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop); - l.add(after); - } - if (beforeStop - beforeStart >= 0) { - GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop); - l.add(before); - } - - return l; - } else if (e.containsP(g)) { - /** - * e completely contains g, delete g, but keep looking, there may be more regions - * i.e.: - * |--------------------- e --------------------| - * |--- g ---| |---- others ----| - */ - return EMPTY_LIST; // don't need to do anything - } else { - /** - * otherwise e overlaps some part of g - * - * figure out which region occurs first on the genome. I.e., is it: - * |------------- g ----------| - * |------------- e ----------| - * - * or: - * |------------- g ----------| - * |------------ e -----------| - * - */ - - GenomeLoc n; - if (e.getStart() < g.getStart()) { - n = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop()); - } else { - n = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1); - } - - // replace g with the new region - return Arrays.asList(n); - } - } - /** * a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 17e74c4f1..17f458f31 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -408,12 +408,12 @@ public class MathUtils { return Math.sqrt(rms); } - public static double rms(Collection l) { + public static double rms(Collection l) { if (l.size() == 0) return 0.0; double rms = 0.0; - for (Double i : l) + for (int i : l) rms += i*i; rms /= l.size(); return Math.sqrt(rms); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 372efa350..f0eb5d399 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -586,6 +586,12 @@ public class Utils { return rcbases; } + static public final List reverse(final List l) { + final List newL = new ArrayList(l); + Collections.reverse(newL); + return newL; + } + /** * Reverse an int array of bases * diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 83961021a..723830a91 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -113,7 +113,7 @@ public class FragmentUtils { } public final static FragmentCollection create(ReadBackedPileup rbp) { - return create(rbp, rbp.size(), PileupElementGetter); + return create(rbp, rbp.getNumberOfElements(), PileupElementGetter); } public final static FragmentCollection create(List reads) { diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 139cded37..f0e164c87 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.interval; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; @@ -8,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -221,6 +224,44 @@ public class IntervalUtils { return GenomeLocSortedSet.createSetFromList(parser,intervals); } + /** + * computes whether the test interval list is equivalent to master. To be equivalent, test must + * contain GenomeLocs covering every base in master, exactly once. Note that this algorithm + * assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but + * rather just 1-6). In order to use this algorithm with contiguous genomelocs first merge them. The algorithm + * doesn't assume that test has discontinuous genomelocs. + * + * Returns a null string if there are no differences, otherwise returns a string describing the difference + * (useful for UnitTests). Assumes both lists are sorted + */ + public static final String equateIntervals(List masterArg, List testArg) { + LinkedList master = new LinkedList(masterArg); + LinkedList test = new LinkedList(testArg); + + while ( ! master.isEmpty() ) { // there's still unchecked bases in master + final GenomeLoc masterHead = master.pop(); + final GenomeLoc testHead = test.pop(); + + if ( testHead.overlapsP(masterHead) ) { + // remove the parts of test that overlap master, and push the remaining + // parts onto master for further comparison. + for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) { + master.push(masterPart); + } + } else { + // testHead is incompatible with masterHead, so we must have extra bases in testHead + // that aren't in master + return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead; + } + } + + if ( test.isEmpty() ) // everything is equal + return null; // no differences + else + return "Remaining elements found in test: first=" + test.peek(); + } + + /** * Check if string argument was intented as a file * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. @@ -384,6 +425,92 @@ public class IntervalUtils { return splitIntervalsToSubLists(locs, splitPoints); } + @Requires({"locs != null", "numParts > 0"}) + @Ensures("result != null") + public static List> splitLocusIntervals(List locs, int numParts) { + // the ideal size of each split + final long bp = IntervalUtils.intervalSize(locs); + final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1); + + // algorithm: + // split = () + // set size = 0 + // pop the head H off locs. + // If size + size(H) < splitSize: + // add H to split, continue + // If size + size(H) == splitSize: + // done with split, put in splits, restart + // if size + size(H) > splitSize: + // cut H into two pieces, first of which has splitSize - size bp + // push both pieces onto locs, continue + // The last split is special -- when you have only one split left, it gets all of the remaining locs + // to deal with rounding issues + final List> splits = new ArrayList>(numParts); + + LinkedList locsLinkedList = new LinkedList(locs); + while ( ! locsLinkedList.isEmpty() ) { + if ( splits.size() + 1 == numParts ) { + // the last one gets all of the remaining parts + splits.add(new ArrayList(locsLinkedList)); + locsLinkedList.clear(); + } else { + final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize); + splits.add(one.split); + locsLinkedList = one.remaining; + } + } + + return splits; + } + + @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) + @Ensures({"result != null"}) + final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + final List split = new ArrayList(); + long size = 0; + + while ( ! remaining.isEmpty() ) { + GenomeLoc head = remaining.pop(); + final long newSize = size + head.size(); + + if ( newSize == idealSplitSize ) { + split.add(head); + break; // we are done + } else if ( newSize > idealSplitSize ) { + final long remainingBp = idealSplitSize - size; + final long cutPoint = head.getStart() + remainingBp; + GenomeLoc[] parts = head.split((int)cutPoint); + remaining.push(parts[1]); + remaining.push(parts[0]); + // when we go around, head.size' = idealSplitSize - size + // so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize + } else { + split.add(head); + size = newSize; + } + } + + return new SplitLocusRecursive(split, remaining); + } + + private final static class SplitLocusRecursive { + final List split; + final LinkedList remaining; + + @Requires({"split != null", "remaining != null"}) + private SplitLocusRecursive(final List split, final LinkedList remaining) { + this.split = split; + this.remaining = remaining; + } + } + + public static List flattenSplitIntervals(List> splits) { + final List locs = new ArrayList(); + for ( final List split : splits ) + locs.addAll(split); + return locs; + } + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { if (numParts < 2) return; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index bec27af38..4db2f974f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -45,6 +45,7 @@ public abstract class AbstractReadBackedPileup pileupElementTracker; protected int size = 0; // cached value of the size of the pileup + protected int abstractSize = -1; // cached value of the abstract size of the pileup protected int nDeletions = 0; // cached value of the number of deletions protected int nMQ0Reads = 0; // cached value of the number of MQ0 reads @@ -145,8 +146,16 @@ public abstract class AbstractReadBackedPileup pileup) { - size += pileup.size(); + size += pileup.getNumberOfElements(); + abstractSize += pileup.depthOfCoverage(); nDeletions += pileup.getNumberOfDeletions(); nMQ0Reads += pileup.getNumberOfMappingQualityZeroReads(); } @@ -574,7 +583,7 @@ public abstract class AbstractReadBackedPileup getReads() { - List reads = new ArrayList(size()); + List reads = new ArrayList(getNumberOfElements()); for ( PileupElement pile : this ) { reads.add(pile.getRead()); } return reads; } @@ -817,7 +836,7 @@ public abstract class AbstractReadBackedPileup getOffsets() { - List offsets = new ArrayList(size()); + List offsets = new ArrayList(getNumberOfElements()); for ( PileupElement pile : this ) { offsets.add(pile.getOffset()); } return offsets; } @@ -828,7 +847,7 @@ public abstract class AbstractReadBackedPileup { return ((GATKSAMRecord)read).isReducedRead(); } - public int getReducedCount() { - if ( ! isReducedRead() ) throw new IllegalArgumentException("Cannot get reduced count for non-reduced read " + getRead().getReadName()); - return ((GATKSAMRecord)read).getReducedCount(offset); + public int getRepresentativeCount() { + return isReducedRead() ? ((GATKSAMRecord)read).getReducedCount(offset) : 1; } - public byte getReducedQual() { - if ( ! isReducedRead() ) throw new IllegalArgumentException("Cannot get reduced qual for non-reduced read " + getRead().getReadName()); - return getQual(); - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index e7c0bc18f..4205ff5af 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -155,7 +155,7 @@ public interface ReadBackedExtendedEventPileup extends ReadBackedPileup { /** * @return the number of elements in this pileup */ - public int size(); + public int getNumberOfElements(); /** * @return the location of this pileup diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index 21dfee8b8..8910487b8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -133,7 +133,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< */ @Override public byte[] getEvents() { - byte[] v = new byte[size()]; + byte[] v = new byte[getNumberOfElements()]; int i = 0; for ( ExtendedEventPileupElement e : this.toExtendedIterable() ) { switch ( e.getType() ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index fd04b455a..9cf62c173 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -169,9 +169,14 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca public int getNumberOfMappingQualityZeroReads(); /** - * @return the number of elements in this pileup + * @return the number of physical elements in this pileup (a reduced read is counted just once) */ - public int size(); + public int getNumberOfElements(); + + /** + * @return the number of abstract elements in this pileup (reduced reads are expanded to count all reads that they represent) + */ + public int depthOfCoverage(); /** * @return true if there are 0 elements in the pileup, false otherwise diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index bf25023d8..b8e892101 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -1,756 +1,745 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; - - -public class AlignmentUtils { - - public static class MismatchCount { - public int numMismatches = 0; - public long mismatchQualities = 0; - } - - public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { - return getMismatchCount(r, refSeq, refIndex).mismatchQualities; - } - - public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { - return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength()); - } - - // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single - // todo -- high performance implementation. We can do a lot better than this right now - public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { - MismatchCount mc = new MismatchCount(); - - int readIdx = 0; - int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count - byte[] readSeq = r.getReadBases(); - Cigar c = r.getCigar(); - for (int i = 0 ; i < c.numCigarElements() ; i++) { - - if ( readIdx > endOnRead ) break; - - CigarElement ce = c.getCigarElement(i); - switch ( ce.getOperator() ) { - case M: - for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { - if ( refIndex >= refSeq.length ) - continue; - if ( readIdx < startOnRead ) continue; - if ( readIdx > endOnRead ) break; - byte refChr = refSeq[refIndex]; - byte readChr = readSeq[readIdx]; - // Note: we need to count X/N's as mismatches because that's what SAM requires - //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || - // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) - // continue; // do not count Ns/Xs/etc ? - if ( readChr != refChr ) { - mc.numMismatches++; - mc.mismatchQualities += r.getBaseQualities()[readIdx]; - } - } - break; - case I: - case S: - readIdx += ce.getLength(); - break; - case D: - case N: - refIndex += ce.getLength(); - break; - case H: - case P: - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - return mc; - } - - /** Returns the number of mismatches in the pileup within the given reference context. - * - * @param pileup the pileup with reads - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { - int mismatches = 0; - for ( PileupElement p : pileup ) - mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); - return mismatches; - } - - /** Returns the number of mismatches in the pileup element within the given reference context. - * - * @param p the pileup element - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { - return mismatchesInRefWindow(p, ref, ignoreTargetSite, false); - } - - /** Returns the number of mismatches in the pileup element within the given reference context. - * - * @param p the pileup element - * @param ref the reference context - * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @param qualitySumInsteadOfMismatchCount if true, return the quality score sum of the mismatches rather than the count - * @return the number of mismatches - */ - public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) { - int sum = 0; - - int windowStart = ref.getWindow().getStart(); - int windowStop = ref.getWindow().getStop(); - byte[] refBases = ref.getBases(); - byte[] readBases = p.getRead().getReadBases(); - byte[] readQualities = p.getRead().getBaseQualities(); - Cigar c = p.getRead().getCigar(); - - int readIndex = 0; - int currentPos = p.getRead().getAlignmentStart(); - int refIndex = Math.max(0, currentPos - windowStart); - - for (int i = 0 ; i < c.numCigarElements() ; i++) { - CigarElement ce = c.getCigarElement(i); - int cigarElementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { - // are we past the ref window? - if ( currentPos > windowStop ) - break; - - // are we before the ref window? - if ( currentPos < windowStart ) - continue; - - byte refChr = refBases[refIndex++]; - - // do we need to skip the target site? - if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos ) - continue; - - byte readChr = readBases[readIndex]; - if ( readChr != refChr ) - sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1; - } - break; - case I: - case S: - readIndex += cigarElementLength; - break; - case D: - case N: - currentPos += cigarElementLength; - if ( currentPos > windowStart ) - refIndex += Math.min(cigarElementLength, currentPos - windowStart); - break; - case H: - case P: - break; - } - } - - return sum; - } - - /** Returns the mismatches in the read within the given reference context. - * - * @param read the SAMRecord - * @param ref the reference context - * @return each base is represented by a bit in the BitSet. True for mismatch, false for ref - */ - public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref) { - // first determine the positions with mismatches - int readLength = read.getReadLength(); - BitSet mismatches = new BitSet(readLength); - - // it's possible we aren't starting at the beginning of a read, - // and we don't need to look at any of the previous context outside our window - // (although we do need future context) - int readStartPos = read.getAlignmentStart(); - int currentReadPos = read.getAlignmentStart(); - - byte[] refBases = ref.getBases(); - int refIndex = readStartPos - ref.getWindow().getStart(); - if ( refIndex < 0 ) { - throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); - } - - byte[] readBases = read.getReadBases(); - int readIndex = 0; - - Cigar c = read.getCigar(); - - for (int i = 0 ; i < c.numCigarElements() ; i++) { - CigarElement ce = c.getCigarElement(i); - int cigarElementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int j = 0; j < cigarElementLength; j++, readIndex++) { - // skip over unwanted bases - if ( currentReadPos++ < readStartPos ) - continue; - - // this is possible if reads extend beyond the contig end - if ( refIndex >= refBases.length ) - break; - - byte refChr = refBases[refIndex]; - byte readChr = readBases[readIndex]; - if ( readChr != refChr ) - mismatches.set(readIndex); - - refIndex++; - } - break; - case I: - case S: - readIndex += cigarElementLength; - break; - case D: - case N: - if ( currentReadPos >= readStartPos ) - refIndex += cigarElementLength; - currentReadPos += cigarElementLength; - break; - case H: - case P: - break; - } - } - return mismatches; - } - /** Returns a BitSet showing what bases are good according to the criteria of maxMismatches in the window - * - * @param read the SAMRecord - * @param ref the reference context - * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good - * @param windowSize window size (on each side) to test - * @return a bitset representing which bases are good - */ - public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { - int readLength = read.getReadLength(); - - // all bits are set to false by default - BitSet result = new BitSet(readLength); - BitSet mismatches = mismatchesInRefWindow(read, ref); - - int currentPos = 0, leftPos = 0, rightPos; - int mismatchCount = 0; - - // calculate how many mismatches exist in the windows to the left/right - for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { - if ( mismatches.get(rightPos) ) - mismatchCount++; - } - if ( mismatchCount <= maxMismatches ) - result.set(currentPos); - - // now, traverse over the read positions - while ( currentPos < readLength ) { - // add a new rightmost position - if ( rightPos < readLength && mismatches.get(rightPos++) ) - mismatchCount++; - // re-penalize the previous position - if ( mismatches.get(currentPos++) ) - mismatchCount++; - // don't penalize the current position - if ( mismatches.get(currentPos) ) - mismatchCount--; - // subtract the leftmost position - if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) ) - mismatchCount--; - - if ( mismatchCount <= maxMismatches ) - result.set(currentPos); - } - - return result; - } - /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. - * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but - * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is - * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. - * Formally, this method simply returns the number of M elements in the cigar. - * @param r alignment - * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). - */ - public static int getNumAlignmentBlocks(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) { - if (e.getOperator() == CigarOperator.M ) n++; - } - - return n; - } - - public static int getNumAlignedBases(final SAMRecord r) { - int n = 0; - final Cigar cigar = r.getCigar(); - if (cigar == null) return 0; - - for (final CigarElement e : cigar.getCigarElements()) { - if (e.getOperator() == CigarOperator.M ) { n += e.getLength(); } - } - - return n; - } - - public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) { - - final byte[] alignment = new byte[read.length]; - int refPos = 0; - int alignPos = 0; - - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos++] = '+'; - } - break; - case D: - case N: - refPos += elementLength; - break; - case M: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = ref[refPos]; - alignPos++; - refPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignment; - } - - public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) { - - boolean atDeletion = false; - if(pileupOffset == -1) { - atDeletion = true; - pileupOffset = refLocus - alignmentStart; - final CigarElement ce = cigar.getCigarElement(0); - if( ce.getOperator() == CigarOperator.S ) { - pileupOffset += ce.getLength(); - } - } - int pos = 0; - int alignmentPos = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - pos += elementLength; - if( pos >= pileupOffset ) { - return alignmentPos; - } - break; - case D: - case N: - if(!atDeletion) { - alignmentPos += elementLength; - } else { - if( pos + elementLength - 1 >= pileupOffset ) { - return alignmentPos + (pileupOffset - pos); - } else { - pos += elementLength; - alignmentPos += elementLength; - } - } - break; - case M: - if( pos + elementLength - 1 >= pileupOffset ) { - return alignmentPos + (pileupOffset - pos); - } else { - pos += elementLength; - alignmentPos += elementLength; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignmentPos; - } - - public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) { - - int alignmentLength = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - break; - case D: - case N: - alignmentLength += elementLength; - break; - case M: - alignmentLength += elementLength; - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - - final byte[] alignment = new byte[alignmentLength]; - int alignPos = 0; - int readPos = 0; - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - if( alignPos > 0 ) { - if( alignment[alignPos-1] == BaseUtils.A ) { alignment[alignPos-1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.C ) { alignment[alignPos-1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.T ) { alignment[alignPos-1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; } - else if( alignment[alignPos-1] == BaseUtils.G ) { alignment[alignPos-1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } - } - case S: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - readPos++; - } - break; - case D: - case N: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = PileupElement.DELETION_BASE; - alignPos++; - } - break; - case M: - for ( int jjj = 0; jjj < elementLength; jjj++ ) { - alignment[alignPos] = read[readPos]; - alignPos++; - readPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignment; - } - - /** - * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format - * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and - * alignment reference index/start. - * @param r record - * @return true if read is unmapped - */ - public static boolean isReadUnmapped(final SAMRecord r) { - if ( r.getReadUnmappedFlag() ) return true; - - // our life would be so much easier if all sam files followed the specs. In reality, - // sam files (including those generated by maq or bwa) miss headers altogether. When - // reading such a SAM file, reference name is set, but since there is no sequence dictionary, - // null is always returned for referenceIndex. Let's be paranoid here, and make sure that - // we do not call the read "unmapped" when it has only reference name set with ref. index missing - // or vice versa. - if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX - || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) - && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; - return true; - } - - /** - * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format - * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and - * alignment reference index/start of the mate. - * @param r sam record for the read - * @return true if read's mate is unmapped - */ - public static boolean isMateUnmapped(final SAMRecord r) { - if ( r.getMateUnmappedFlag() ) return true; - - // our life would be so much easier if all sam files followed the specs. In reality, - // sam files (including those generated by maq or bwa) miss headers altogether. When - // reading such a SAM file, reference name is set, but since there is no sequence dictionary, - // null is always returned for referenceIndex. Let's be paranoid here, and make sure that - // we do not call the read "unmapped" when it has only reference name set with ref. index missing - // or vice versa. - if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX - || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) - && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; - return true; - } - - /** Returns true is read is mapped and mapped uniquely (Q>0). - * - * @param read - * @return - */ - public static boolean isReadUniquelyMapped(SAMRecord read) { - return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0; - } - - /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). - * @param read - * @return - */ - public static byte [] getQualsInCycleOrder(SAMRecord read) { - if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getBaseQualities(); - - return Utils.reverse(read.getBaseQualities()); - } - - /** Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from - * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base - * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array - * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities - * are available this method will throw a runtime exception. - * @param read - * @return - */ - public static byte [] getOriginalQualsInCycleOrder(SAMRecord read) { - if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getOriginalBaseQualities(); - - return Utils.reverse(read.getOriginalBaseQualities()); - } - - /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq - * starting at 0-based position refIndex on the refSeq and specified by its cigar. - * The last argument readIndex specifies 0-based position on the read where the alignment described by the - * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. - * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case - * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are - * always the positions where the cigar starts on the ref and on the read, respectively. - * - * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar - * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT - * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence - * is not repeated), the original cigar is returned. - * @param cigar structure of the original alignment - * @param refSeq reference sequence the read is aligned to - * @param readSeq read sequence - * @param refIndex 0-based alignment start position on ref - * @param readIndex 0-based alignment start position on read - * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) - */ - public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { - - int indexOfIndel = -1; - for ( int i = 0; i < cigar.numCigarElements(); i++ ) { - CigarElement ce = cigar.getCigarElement(i); - if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) { - // if there is more than 1 indel, don't left align - if ( indexOfIndel != -1 ) - return cigar; - indexOfIndel = i; - } - } - - // if there is no indel or if the alignment starts with an insertion (so that there - // is no place on the read to move that insertion further left), we are done - if ( indexOfIndel < 1 ) return cigar; - - final int indelLength = cigar.getCigarElement(indexOfIndel).getLength(); - - byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); - if ( altString == null ) - return cigar; - - Cigar newCigar = cigar; - for ( int i = 0; i < indelLength; i++ ) { - newCigar = moveCigarLeft(newCigar, indexOfIndel); - byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); - - // check to make sure we haven't run off the end of the read - boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar); - - if ( Arrays.equals(altString, newAltString) ) { - cigar = newCigar; - i = -1; - if ( reachedEndOfRead ) - cigar = cleanUpCigar(cigar); - } - - if ( reachedEndOfRead ) - break; - } - - return cigar; - } - - private static boolean cigarHasZeroSizeElement(Cigar c) { - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getLength() == 0 ) - return true; - } - return false; - } - - private static Cigar cleanUpCigar(Cigar c) { - ArrayList elements = new ArrayList(c.numCigarElements()-1); - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getLength() != 0 && - (elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) { - elements.add(ce); - } - } - return new Cigar(elements); - } - - private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) { - // get the first few elements - ArrayList elements = new ArrayList(cigar.numCigarElements()); - for ( int i = 0; i < indexOfIndel - 1; i++) - elements.add(cigar.getCigarElement(i)); - - // get the indel element and move it left one base - CigarElement ce = cigar.getCigarElement(indexOfIndel-1); - elements.add(new CigarElement(ce.getLength()-1, ce.getOperator())); - elements.add(cigar.getCigarElement(indexOfIndel)); - if ( indexOfIndel+1 < cigar.numCigarElements() ) { - ce = cigar.getCigarElement(indexOfIndel+1); - elements.add(new CigarElement(ce.getLength()+1, ce.getOperator())); - } else { - elements.add(new CigarElement(1, CigarOperator.M)); - } - - // get the last few elements - for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++) - elements.add(cigar.getCigarElement(i)); - return new Cigar(elements); - } - - private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { - CigarElement indel = cigar.getCigarElement(indexOfIndel); - int indelLength = indel.getLength(); - - int totalRefBases = 0; - for ( int i = 0; i < indexOfIndel; i++ ) { - CigarElement ce = cigar.getCigarElement(i); - int length = ce.getLength(); - - switch( ce.getOperator() ) { - case M: - readIndex += length; - refIndex += length; - totalRefBases += length; - break; - case S: - readIndex += length; - break; - case N: - refIndex += length; - totalRefBases += length; - break; - default: - break; - } - } - - // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them - if ( totalRefBases + indelLength > refSeq.length ) - indelLength -= (totalRefBases + indelLength - refSeq.length); - - // the indel-based reference string - byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))]; - - // add the bases before the indel, making sure it's not aligned off the end of the reference - if ( refIndex > alt.length || refIndex > refSeq.length ) - return null; - System.arraycopy(refSeq, 0, alt, 0, refIndex); - int currentPos = refIndex; - - // take care of the indel - if ( indel.getOperator() == CigarOperator.D ) { - refIndex += indelLength; - } else { - System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength); - currentPos += indelLength; - } - - // add the bases after the indel, making sure it's not aligned off the end of the reference - if ( refSeq.length - refIndex > alt.length - currentPos ) - return null; - System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex); - - return alt; - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; + + +public class AlignmentUtils { + + public static class MismatchCount { + public int numMismatches = 0; + public long mismatchQualities = 0; + } + + public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r, refSeq, refIndex).mismatchQualities; + } + + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength()); + } + + // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single + // todo -- high performance implementation. We can do a lot better than this right now + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { + MismatchCount mc = new MismatchCount(); + + int readIdx = 0; + int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count + byte[] readSeq = r.getReadBases(); + Cigar c = r.getCigar(); + for (int i = 0 ; i < c.numCigarElements() ; i++) { + + if ( readIdx > endOnRead ) break; + + CigarElement ce = c.getCigarElement(i); + switch ( ce.getOperator() ) { + case M: + for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { + if ( refIndex >= refSeq.length ) + continue; + if ( readIdx < startOnRead ) continue; + if ( readIdx > endOnRead ) break; + byte refChr = refSeq[refIndex]; + byte readChr = readSeq[readIdx]; + // Note: we need to count X/N's as mismatches because that's what SAM requires + //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + // continue; // do not count Ns/Xs/etc ? + if ( readChr != refChr ) { + mc.numMismatches++; + mc.mismatchQualities += r.getBaseQualities()[readIdx]; + } + } + break; + case I: + case S: + readIdx += ce.getLength(); + break; + case D: + case N: + refIndex += ce.getLength(); + break; + case H: + case P: + break; + default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + + } + return mc; + } + + /** Returns the number of mismatches in the pileup within the given reference context. + * + * @param pileup the pileup with reads + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { + int mismatches = 0; + for ( PileupElement p : pileup ) + mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); + return mismatches; + } + + /** Returns the number of mismatches in the pileup element within the given reference context. + * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { + return mismatchesInRefWindow(p, ref, ignoreTargetSite, false); + } + + /** Returns the number of mismatches in the pileup element within the given reference context. + * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @param qualitySumInsteadOfMismatchCount if true, return the quality score sum of the mismatches rather than the count + * @return the number of mismatches + */ + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) { + int sum = 0; + + int windowStart = ref.getWindow().getStart(); + int windowStop = ref.getWindow().getStop(); + byte[] refBases = ref.getBases(); + byte[] readBases = p.getRead().getReadBases(); + byte[] readQualities = p.getRead().getBaseQualities(); + Cigar c = p.getRead().getCigar(); + + int readIndex = 0; + int currentPos = p.getRead().getAlignmentStart(); + int refIndex = Math.max(0, currentPos - windowStart); + + for (int i = 0 ; i < c.numCigarElements() ; i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) { + // are we past the ref window? + if ( currentPos > windowStop ) + break; + + // are we before the ref window? + if ( currentPos < windowStart ) + continue; + + byte refChr = refBases[refIndex++]; + + // do we need to skip the target site? + if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos ) + continue; + + byte readChr = readBases[readIndex]; + if ( readChr != refChr ) + sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1; + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + currentPos += cigarElementLength; + if ( currentPos > windowStart ) + refIndex += Math.min(cigarElementLength, currentPos - windowStart); + break; + case H: + case P: + break; + } + } + + return sum; + } + + /** Returns the number of mismatches in the pileup element within the given reference context. + * + * @param read the SAMRecord + * @param ref the reference context + * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good + * @param windowSize window size (on each side) to test + * @return a bitset representing which bases are good + */ + public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { + // first determine the positions with mismatches + int readLength = read.getReadLength(); + BitSet mismatches = new BitSet(readLength); + + // it's possible we aren't starting at the beginning of a read, + // and we don't need to look at any of the previous context outside our window + // (although we do need future context) + int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize); + int currentReadPos = read.getAlignmentStart(); + + byte[] refBases = ref.getBases(); + int refIndex = readStartPos - ref.getWindow().getStart(); + if ( refIndex < 0 ) { + throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); + } + + byte[] readBases = read.getReadBases(); + int readIndex = 0; + + Cigar c = read.getCigar(); + + for (int i = 0 ; i < c.numCigarElements() ; i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++) { + // skip over unwanted bases + if ( currentReadPos++ < readStartPos ) + continue; + + // this is possible if reads extend beyond the contig end + if ( refIndex >= refBases.length ) + break; + + byte refChr = refBases[refIndex]; + byte readChr = readBases[readIndex]; + if ( readChr != refChr ) + mismatches.set(readIndex); + + refIndex++; + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + if ( currentReadPos >= readStartPos ) + refIndex += cigarElementLength; + currentReadPos += cigarElementLength; + break; + case H: + case P: + break; + } + } + + // all bits are set to false by default + BitSet result = new BitSet(readLength); + + int currentPos = 0, leftPos = 0, rightPos; + int mismatchCount = 0; + + // calculate how many mismatches exist in the windows to the left/right + for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { + if ( mismatches.get(rightPos) ) + mismatchCount++; + } + if ( mismatchCount <= maxMismatches ) + result.set(currentPos); + + // now, traverse over the read positions + while ( currentPos < readLength ) { + // add a new rightmost position + if ( rightPos < readLength && mismatches.get(rightPos++) ) + mismatchCount++; + // re-penalize the previous position + if ( mismatches.get(currentPos++) ) + mismatchCount++; + // don't penalize the current position + if ( mismatches.get(currentPos) ) + mismatchCount--; + // subtract the leftmost position + if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) ) + mismatchCount--; + + if ( mismatchCount <= maxMismatches ) + result.set(currentPos); + } + + return result; + } + /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. + * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but + * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is + * a much more efficient alternative to r.getAlignmentBlocks.size() in the situations when this number is all that is needed. + * Formally, this method simply returns the number of M elements in the cigar. + * @param r alignment + * @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored). + */ + public static int getNumAlignmentBlocks(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) { + if (e.getOperator() == CigarOperator.M ) n++; + } + + return n; + } + + public static int getNumAlignedBases(final SAMRecord r) { + int n = 0; + final Cigar cigar = r.getCigar(); + if (cigar == null) return 0; + + for (final CigarElement e : cigar.getCigarElements()) { + if (e.getOperator() == CigarOperator.M ) { n += e.getLength(); } + } + + return n; + } + + public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) { + + final byte[] alignment = new byte[read.length]; + int refPos = 0; + int alignPos = 0; + + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + case S: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos++] = '+'; + } + break; + case D: + case N: + refPos += elementLength; + break; + case M: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos] = ref[refPos]; + alignPos++; + refPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignment; + } + + public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) { + + boolean atDeletion = false; + if(pileupOffset == -1) { + atDeletion = true; + pileupOffset = refLocus - alignmentStart; + final CigarElement ce = cigar.getCigarElement(0); + if( ce.getOperator() == CigarOperator.S ) { + pileupOffset += ce.getLength(); + } + } + int pos = 0; + int alignmentPos = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + case S: + pos += elementLength; + if( pos >= pileupOffset ) { + return alignmentPos; + } + break; + case D: + case N: + if(!atDeletion) { + alignmentPos += elementLength; + } else { + if( pos + elementLength - 1 >= pileupOffset ) { + return alignmentPos + (pileupOffset - pos); + } else { + pos += elementLength; + alignmentPos += elementLength; + } + } + break; + case M: + if( pos + elementLength - 1 >= pileupOffset ) { + return alignmentPos + (pileupOffset - pos); + } else { + pos += elementLength; + alignmentPos += elementLength; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignmentPos; + } + + public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) { + + int alignmentLength = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + case S: + break; + case D: + case N: + alignmentLength += elementLength; + break; + case M: + alignmentLength += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + + final byte[] alignment = new byte[alignmentLength]; + int alignPos = 0; + int readPos = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + if( alignPos > 0 ) { + if( alignment[alignPos-1] == BaseUtils.A ) { alignment[alignPos-1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; } + else if( alignment[alignPos-1] == BaseUtils.C ) { alignment[alignPos-1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; } + else if( alignment[alignPos-1] == BaseUtils.T ) { alignment[alignPos-1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; } + else if( alignment[alignPos-1] == BaseUtils.G ) { alignment[alignPos-1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } + } + case S: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + readPos++; + } + break; + case D: + case N: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos] = PileupElement.DELETION_BASE; + alignPos++; + } + break; + case M: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos] = read[readPos]; + alignPos++; + readPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignment; + } + + /** + * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and + * alignment reference index/start. + * @param r record + * @return true if read is unmapped + */ + public static boolean isReadUnmapped(final SAMRecord r) { + if ( r.getReadUnmappedFlag() ) return true; + + // our life would be so much easier if all sam files followed the specs. In reality, + // sam files (including those generated by maq or bwa) miss headers altogether. When + // reading such a SAM file, reference name is set, but since there is no sequence dictionary, + // null is always returned for referenceIndex. Let's be paranoid here, and make sure that + // we do not call the read "unmapped" when it has only reference name set with ref. index missing + // or vice versa. + if ( ( r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) + && r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; + return true; + } + + /** + * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and + * alignment reference index/start of the mate. + * @param r sam record for the read + * @return true if read's mate is unmapped + */ + public static boolean isMateUnmapped(final SAMRecord r) { + if ( r.getMateUnmappedFlag() ) return true; + + // our life would be so much easier if all sam files followed the specs. In reality, + // sam files (including those generated by maq or bwa) miss headers altogether. When + // reading such a SAM file, reference name is set, but since there is no sequence dictionary, + // null is always returned for referenceIndex. Let's be paranoid here, and make sure that + // we do not call the read "unmapped" when it has only reference name set with ref. index missing + // or vice versa. + if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) ) + && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; + return true; + } + + /** Returns true is read is mapped and mapped uniquely (Q>0). + * + * @param read + * @return + */ + public static boolean isReadUniquelyMapped(SAMRecord read) { + return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0; + } + + /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from + * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base + * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array + * of read's base qualitites is inverted (in this case new array is allocated and returned). + * @param read + * @return + */ + public static byte [] getQualsInCycleOrder(SAMRecord read) { + if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getBaseQualities(); + + return Utils.reverse(read.getBaseQualities()); + } + + /** Returns the array of original base qualitites (before recalibration) in the order the bases were read on the machine (i.e. always starting from + * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base + * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array + * of read's base qualitites is inverted (in this case new array is allocated and returned). If no original base qualities + * are available this method will throw a runtime exception. + * @param read + * @return + */ + public static byte [] getOriginalQualsInCycleOrder(SAMRecord read) { + if ( isReadUnmapped(read) || ! read.getReadNegativeStrandFlag() ) return read.getOriginalBaseQualities(); + + return Utils.reverse(read.getOriginalBaseQualities()); + } + + /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq + * starting at 0-based position refIndex on the refSeq and specified by its cigar. + * The last argument readIndex specifies 0-based position on the read where the alignment described by the + * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. + * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case + * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are + * always the positions where the cigar starts on the ref and on the read, respectively. + * + * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar + * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT + * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence + * is not repeated), the original cigar is returned. + * @param cigar structure of the original alignment + * @param refSeq reference sequence the read is aligned to + * @param readSeq read sequence + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) + */ + public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) { + + int indexOfIndel = -1; + for ( int i = 0; i < cigar.numCigarElements(); i++ ) { + CigarElement ce = cigar.getCigarElement(i); + if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) { + // if there is more than 1 indel, don't left align + if ( indexOfIndel != -1 ) + return cigar; + indexOfIndel = i; + } + } + + // if there is no indel or if the alignment starts with an insertion (so that there + // is no place on the read to move that insertion further left), we are done + if ( indexOfIndel < 1 ) return cigar; + + final int indelLength = cigar.getCigarElement(indexOfIndel).getLength(); + + byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); + if ( altString == null ) + return cigar; + + Cigar newCigar = cigar; + for ( int i = 0; i < indelLength; i++ ) { + newCigar = moveCigarLeft(newCigar, indexOfIndel); + byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex); + + // check to make sure we haven't run off the end of the read + boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar); + + if ( Arrays.equals(altString, newAltString) ) { + cigar = newCigar; + i = -1; + if ( reachedEndOfRead ) + cigar = cleanUpCigar(cigar); + } + + if ( reachedEndOfRead ) + break; + } + + return cigar; + } + + private static boolean cigarHasZeroSizeElement(Cigar c) { + for ( CigarElement ce : c.getCigarElements() ) { + if ( ce.getLength() == 0 ) + return true; + } + return false; + } + + private static Cigar cleanUpCigar(Cigar c) { + ArrayList elements = new ArrayList(c.numCigarElements()-1); + for ( CigarElement ce : c.getCigarElements() ) { + if ( ce.getLength() != 0 && + (elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) { + elements.add(ce); + } + } + return new Cigar(elements); + } + + private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) { + // get the first few elements + ArrayList elements = new ArrayList(cigar.numCigarElements()); + for ( int i = 0; i < indexOfIndel - 1; i++) + elements.add(cigar.getCigarElement(i)); + + // get the indel element and move it left one base + CigarElement ce = cigar.getCigarElement(indexOfIndel-1); + elements.add(new CigarElement(ce.getLength()-1, ce.getOperator())); + elements.add(cigar.getCigarElement(indexOfIndel)); + if ( indexOfIndel+1 < cigar.numCigarElements() ) { + ce = cigar.getCigarElement(indexOfIndel+1); + elements.add(new CigarElement(ce.getLength()+1, ce.getOperator())); + } else { + elements.add(new CigarElement(1, CigarOperator.M)); + } + + // get the last few elements + for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++) + elements.add(cigar.getCigarElement(i)); + return new Cigar(elements); + } + + private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + CigarElement indel = cigar.getCigarElement(indexOfIndel); + int indelLength = indel.getLength(); + + int totalRefBases = 0; + for ( int i = 0; i < indexOfIndel; i++ ) { + CigarElement ce = cigar.getCigarElement(i); + int length = ce.getLength(); + + switch( ce.getOperator() ) { + case M: + readIndex += length; + refIndex += length; + totalRefBases += length; + break; + case S: + readIndex += length; + break; + case N: + refIndex += length; + totalRefBases += length; + break; + default: + break; + } + } + + // sometimes, when there are very large known indels, we won't have enough reference sequence to cover them + if ( totalRefBases + indelLength > refSeq.length ) + indelLength -= (totalRefBases + indelLength - refSeq.length); + + // the indel-based reference string + byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))]; + + // add the bases before the indel, making sure it's not aligned off the end of the reference + if ( refIndex > alt.length || refIndex > refSeq.length ) + return null; + System.arraycopy(refSeq, 0, alt, 0, refIndex); + int currentPos = refIndex; + + // take care of the indel + if ( indel.getOperator() == CigarOperator.D ) { + refIndex += indelLength; + } else { + System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength); + currentPos += indelLength; + } + + // add the bases after the indel, making sure it's not aligned off the end of the reference + if ( refSeq.length - refIndex > alt.length - currentPos ) + return null; + System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex); + + return alt; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index d6c0b68b8..ede75817a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -43,6 +43,7 @@ import java.util.Map; * */ public class GATKSAMRecord extends BAMRecord { + public static final String REDUCED_READ_QUALITY_TAG = "RR"; // the SAMRecord data we're caching private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; @@ -151,7 +152,7 @@ public class GATKSAMRecord extends BAMRecord { public byte[] getReducedReadCounts() { if ( ! retrievedReduceReadCounts ) { - reducedReadCounts = getByteArrayAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); + reducedReadCounts = getByteArrayAttribute(REDUCED_READ_QUALITY_TAG); retrievedReduceReadCounts = true; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index da047a3a0..9372d6478 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -51,8 +51,6 @@ public class ReadUtils { // // ---------------------------------------------------------------------------------------------------- - public static final String REDUCED_READ_QUALITY_TAG = "RQ"; - // ---------------------------------------------------------------------------------------------------- // // General utilities diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 297a8501a..97d01bf0a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -78,7 +78,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getBaseFilteredPileup(10); Assert.assertEquals(pileup.getLocation().getStart(), 5, "Extended event pileup at wrong location"); - Assert.assertEquals(pileup.size(), 3, "Pileup size is incorrect"); + Assert.assertEquals(pileup.getNumberOfElements(), 3, "Pileup size is incorrect"); foundExtendedEventPileup = true; } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index 59a6ecb8d..4471202c7 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -29,7 +29,7 @@ public class ReadUtilsUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG, REDUCED_READ_COUNTS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_QUALITY_TAG, REDUCED_READ_COUNTS); } private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) { @@ -65,7 +65,7 @@ public class ReadUtilsUnitTest extends BaseTest { Assert.assertFalse(readp.isReducedRead()); Assert.assertTrue(reducedreadp.isReducedRead()); - Assert.assertEquals(reducedreadp.getReducedCount(), REDUCED_READ_COUNTS[0]); - Assert.assertEquals(reducedreadp.getReducedQual(), readp.getQual()); + Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); + Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 067ec14a0..9c3b905c2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -131,6 +132,135 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); } + // ------------------------------------------------------------------------------------- + // + // splitLocusIntervals tests + // + // ------------------------------------------------------------------------------------- + + /** large scale tests for many intervals */ + private class SplitLocusIntervalsTest extends TestDataProvider { + final List originalIntervals; + final public int parts; + + private SplitLocusIntervalsTest(final String name, List originalIntervals, final int parts) { + super(SplitLocusIntervalsTest.class, name); + this.parts = parts; + this.originalIntervals = originalIntervals; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "IntervalRepartitionTest") + public Object[][] createIntervalRepartitionTest() { + for ( int parts : Arrays.asList(1, 2, 3, 10, 13, 100, 151, 1000, 10000) ) { + //for ( int parts : Arrays.asList(10) ) { + new SplitLocusIntervalsTest("hg19RefLocs", hg19ReferenceLocs, parts); + new SplitLocusIntervalsTest("hg19ExomeLocs", hg19exomeIntervals, parts); + } + + return SplitLocusIntervalsTest.getTests(SplitLocusIntervalsTest.class); + } + + @Test(enabled = true, dataProvider = "IntervalRepartitionTest") + public void testIntervalRepartition(SplitLocusIntervalsTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts); + Assert.assertEquals(splitByLocus.size(), test.parts, "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test overall size + final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); + final long flatSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(flatSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + // test size of each split + final long ideal = (long)Math.floor(originalSize / (1.0 * test.parts)); + final long maxSize = ideal + (originalSize % test.parts) * test.parts; // no more than N * rounding error in size + for ( final List split : splitByLocus ) { + final long splitSize = IntervalUtils.intervalSize(split); + Assert.assertTrue(splitSize >= ideal && splitSize <= maxSize, + String.format("SplitLocusIntervals interval (start=%s) has size %d outside of bounds ideal=%d, max=%d", + split.get(0), splitSize, ideal, maxSize)); + } + + // test that every base in original is covered once by a base in split by locus intervals + String diff = IntervalUtils.equateIntervals(test.originalIntervals, flat); + Assert.assertNull(diff, diff); + } + + /** small scale tests where the expected cuts are enumerated upfront for testing */ + private class SplitLocusIntervalsSmallTest extends TestDataProvider { + final List original; + final public int parts; + final public int expectedParts; + final List expected; + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected) { + this(name, originalIntervals, parts, expected, parts); + } + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected, int expectedParts) { + super(SplitLocusIntervalsSmallTest.class, name); + this.parts = parts; + this.expectedParts = expectedParts; + this.original = originalIntervals; + this.expected = expected; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "SplitLocusIntervalsSmallTest") + public Object[][] createSplitLocusIntervalsSmallTest() { + GenomeLoc bp01_10 = hg19GenomeLocParser.createGenomeLoc("1", 1, 10); + + GenomeLoc bp1_5 = hg19GenomeLocParser.createGenomeLoc("1", 1, 5); + GenomeLoc bp6_10 = hg19GenomeLocParser.createGenomeLoc("1", 6, 10); + new SplitLocusIntervalsSmallTest("cut into two", Arrays.asList(bp01_10), 2, Arrays.asList(bp1_5, bp6_10)); + + GenomeLoc bp20_30 = hg19GenomeLocParser.createGenomeLoc("1", 20, 30); + new SplitLocusIntervalsSmallTest("two in two", Arrays.asList(bp01_10, bp20_30), 2, Arrays.asList(bp01_10, bp20_30)); + + GenomeLoc bp1_7 = hg19GenomeLocParser.createGenomeLoc("1", 1, 7); + GenomeLoc bp8_10 = hg19GenomeLocParser.createGenomeLoc("1", 8, 10); + GenomeLoc bp20_23 = hg19GenomeLocParser.createGenomeLoc("1", 20, 23); + GenomeLoc bp24_30 = hg19GenomeLocParser.createGenomeLoc("1", 24, 30); + new SplitLocusIntervalsSmallTest("two in three", Arrays.asList(bp01_10, bp20_30), 3, + Arrays.asList(bp1_7, bp8_10, bp20_23, bp24_30)); + + GenomeLoc bp1_2 = hg19GenomeLocParser.createGenomeLoc("1", 1, 2); + GenomeLoc bp1_1 = hg19GenomeLocParser.createGenomeLoc("1", 1, 1); + GenomeLoc bp2_2 = hg19GenomeLocParser.createGenomeLoc("1", 2, 2); + new SplitLocusIntervalsSmallTest("too many pieces", Arrays.asList(bp1_2), 5, Arrays.asList(bp1_1, bp2_2), 2); + + new SplitLocusIntervalsSmallTest("emptyList", Collections.emptyList(), 5, Collections.emptyList(), 0); + + return SplitLocusIntervalsSmallTest.getTests(SplitLocusIntervalsSmallTest.class); + } + + @Test(enabled = true, dataProvider = "SplitLocusIntervalsSmallTest") + public void splitLocusIntervalsSmallTest(SplitLocusIntervalsSmallTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.original, test.parts); + Assert.assertEquals(splitByLocus.size(), test.expectedParts, "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test sizes + final long originalSize = IntervalUtils.intervalSize(test.original); + final long splitSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(splitSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + Assert.assertEquals(flat, test.expected, "SplitLocusIntervals locs not expected intervals"); + } + + // + // Misc. tests + // + @Test(expectedExceptions=UserException.class) public void testMergeListsBySetOperatorNoOverlap() { // a couple of lists we'll use for the testing diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index b07da7cc8..564c987e5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -102,7 +102,7 @@ public class ReadBackedPileupUnitTest { ReadBackedPileup nullRgPileup = pileup.getPileupForReadGroup(null); List nullRgReads = nullRgPileup.getReads(); - Assert.assertEquals(nullRgPileup.size(), 3, "Wrong number of reads in null read group"); + Assert.assertEquals(nullRgPileup.getNumberOfElements(), 3, "Wrong number of reads in null read group"); Assert.assertEquals(nullRgReads.get(0), read1, "Read " + read1.getReadName() + " should be in null rg but isn't"); Assert.assertEquals(nullRgReads.get(1), read2, "Read " + read2.getReadName() + " should be in null rg but isn't"); Assert.assertEquals(nullRgReads.get(2), read3, "Read " + read3.getReadName() + " should be in null rg but isn't"); @@ -187,7 +187,7 @@ public class ReadBackedPileupUnitTest { ReadBackedPileup pileup = new ReadBackedPileupImpl(null,sampleToPileupMap); ReadBackedPileup sample2Pileup = pileup.getPileupForSample(sample2); - Assert.assertEquals(sample2Pileup.size(),1,"Sample 2 pileup has wrong number of elements"); + Assert.assertEquals(sample2Pileup.getNumberOfElements(),1,"Sample 2 pileup has wrong number of elements"); Assert.assertEquals(sample2Pileup.getReads().get(0),read2,"Sample 2 pileup has incorrect read"); ReadBackedPileup missingSamplePileup = pileup.getPileupForSample("missing"); diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index ee4cb9c4a..2609c3607 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -35,6 +35,8 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { // Include unmapped reads by default. this.includeUnmapped = true + override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount + protected override def maxIntervals = { GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).contigs.size } @@ -44,3 +46,4 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles) } } + diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala index c89e63668..24c2831b6 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala @@ -53,9 +53,6 @@ trait GATKScatterFunction extends ScatterFunction { /** Whether the last scatter job should also include any unmapped reads. */ protected var includeUnmapped: Boolean = _ - /** The total number of clone jobs that will be created. */ - override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount - override def init() { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] this.referenceSequence = this.originalGATK.reference_sequence diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index f65d5ab29..40a6fc4b4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -35,6 +35,8 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction protected override def maxIntervals = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size + override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount + def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala index 50482033f..8f52b9b82 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala @@ -24,8 +24,21 @@ package org.broadinstitute.sting.queue.extensions.gatk +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.queue.function.InProcessFunction + /** - * For now returns an IntervalScatterFunction. - * TODO: A scatter function that divides down to the locus level. + * A scatter function that divides down to the locus level. */ -class LocusScatterFunction extends IntervalScatterFunction {} +//class LocusScatterFunction extends IntervalScatterFunction { } + +class LocusScatterFunction extends GATKScatterFunction with InProcessFunction { + protected override def maxIntervals = scatterCount + + def run() { + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) + } +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala new file mode 100644 index 000000000..f56c4aa02 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Currently ReadScatterFunction only does ContigScattering, but it + * could in principle do something more aggressive. + */ +class ReadScatterFunction extends ContigScatterFunction { } +