diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index a7bb58d0c..ba1da7c87 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -84,12 +84,13 @@ public class AlleleBiasedDownsamplingUtils { // start by stratifying the reads by the alleles they represent at this position for( final PileupElement pe : pileup ) { // we do not want to remove a reduced read - if ( pe.getRead().isReducedRead() ) + if ( pe.getRead().isReducedRead() ) { reducedReadPileups.add(pe); - - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); + } else { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } } // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index aeec36c18..4adb2ca71 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -99,7 +99,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); - depth += (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index a194fe323..5acea12f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -144,7 +144,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa continue; // read is non-informative if (!vc.getAlleles().contains(a)) continue; // sanity check - shouldn't be needed - alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); + alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 167e5df63..ff3d7940f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -116,8 +116,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat else if (table1 == null) return annotationForOneTable(pValueForContingencyTable(table2)); else { // take the one with the best (i.e., least significant pvalue) - double pvalue1 = Math.max(pValueForContingencyTable(table1), MIN_PVALUE); - double pvalue2 = Math.max(pValueForContingencyTable(table2), MIN_PVALUE); + double pvalue1 = pValueForContingencyTable(table1); + double pvalue2 = pValueForContingencyTable(table2); return annotationForOneTable(Math.max(pvalue1, pvalue2)); } } @@ -129,7 +129,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * @return a hash map from FS -> phred-scaled pValue */ private Map annotationForOneTable(final double pValue) { - final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)); + final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs return Collections.singletonMap(FS, value); // Map map = new HashMap(); // map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue))); @@ -265,24 +265,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for (PerReadAlleleLikelihoodMap maps : 
stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); - final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); - - if ( !matchesRef && !matchesAlt ) - continue; - - boolean isFW = el.getKey().getReadNegativeStrandFlag(); - - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - + final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); - table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; + updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount); } } return table; } + /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc @@ -299,31 +291,36 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - // ignore reduced reads because they are always on the forward strand! - // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test - if ( p.getRead().isReducedRead() ) - continue; - if ( ! 
RankSumTest.isUsableBase(p, false) ) // ignore deletions continue; if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) continue; - final Allele base = Allele.create(p.getBase(), false); - final boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - - final boolean matchesRef = ref.equals(base, true); - final boolean matchesAlt = alt.equals(base, true); - if ( matchesRef || matchesAlt ) { - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - - table[row][column] += p.getRepresentativeCount(); - } + updateTable(table, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount()); } } return table; } + + private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { + // ignore reduced reads because they are always on the forward strand! + // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test + if ( read.isReducedRead() ) + return; + + final boolean matchesRef = allele.equals(ref, true); + final boolean matchesAlt = allele.equals(alt, true); + + if ( matchesRef || matchesAlt ) { + + final boolean isFW = !read.getReadNegativeStrandFlag(); + + int row = matchesRef ? 0 : 1; + int column = isFW ? 
0 : 1; + + table[row][column] += representativeCount; + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 3bb3d7d5a..2b3290595 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -95,9 +95,9 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota for ( byte base : ref.getBases() ) { int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex ) + if ( baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal() ) gc++; - else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex ) + else if ( baseIndex == BaseUtils.Base.A.ordinal() || baseIndex == BaseUtils.Base.T.ordinal() ) at++; else ; // ignore diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index fe4075117..3acba48ae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -236,8 +235,8 @@ public class HaplotypeScore extends 
InfoFieldAnnotation implements StandardAnnot final byte[] haplotypeBases = new byte[contextSize]; Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); - final double[] baseQualities = new double[contextSize]; - Arrays.fill(baseQualities, 0.0); + final byte[] baseQualities = new byte[contextSize]; + Arrays.fill(baseQualities, (byte)0); byte[] readBases = read.getReadBases(); readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string @@ -267,7 +266,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot readQuals[baseOffset] = (byte) 0; } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them haplotypeBases[i] = readBases[baseOffset]; - baseQualities[i] = (double) readQuals[baseOffset]; + baseQualities[i] = readQuals[baseOffset]; } return new Haplotype(haplotypeBases, baseQualities); @@ -286,10 +285,10 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final int length = a.length; final byte[] consensusChars = new byte[length]; - final double[] consensusQuals = new double[length]; + final int[] consensusQuals = new int[length]; - final double[] qualsA = haplotypeA.getQuals(); - final double[] qualsB = haplotypeB.getQuals(); + final int[] qualsA = haplotypeA.getQuals(); + final int[] qualsB = haplotypeB.getQuals(); for (int i = 0; i < length; i++) { chA = a[i]; @@ -300,7 +299,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if ((chA == wc) && (chB == wc)) { consensusChars[i] = wc; - consensusQuals[i] = 0.0; + consensusQuals[i] = 0; } else if ((chA == wc)) { consensusChars[i] = chB; consensusQuals[i] = qualsB[i]; @@ -433,7 +432,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } @@ -441,4 +439,46 @@ public class HaplotypeScore extends InfoFieldAnnotation 
implements StandardAnnot public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); } + + private static class Haplotype { + private final byte[] bases; + private final int[] quals; + private int qualitySum = -1; + + public Haplotype( final byte[] bases, final int[] quals ) { + this.bases = bases; + this.quals = quals; + } + + public Haplotype( final byte[] bases, final int qual ) { + this.bases = bases; + quals = new int[bases.length]; + Arrays.fill(quals, qual); + } + + public Haplotype( final byte[] bases, final byte[] quals ) { + this.bases = bases; + this.quals = new int[quals.length]; + for ( int i = 0 ; i < quals.length; i++ ) + this.quals[i] = (int)quals[i]; + } + + public double getQualitySum() { + if ( qualitySum == -1 ) { + qualitySum = 0; + for ( final int qual : quals ) { + qualitySum += qual; + } + } + return qualitySum; + } + + public int[] getQuals() { + return quals.clone(); + } + + public byte[] getBases() { + return bases.clone(); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 99dadea54..f03a25c04 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.*; import 
org.broadinstitute.sting.utils.exceptions.UserException; @@ -214,10 +215,10 @@ public class VariantAnnotatorEngine { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences - vc = annotateDBs(tracker, ref, vc, infoAnnotations); + vc = annotateDBs(tracker, ref.getLocus(), vc, infoAnnotations); // annotate expressions where available - annotateExpressions(tracker, ref, infoAnnotations); + annotateExpressions(tracker, ref.getLocus(), infoAnnotations); // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { @@ -254,10 +255,22 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make(); } - private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { + public VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc) { + final Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); /* seed with the existing INFO attributes, as annotateContext does, so that attributes() below does not drop them */ + vc = annotateDBs(tracker, loc, vc, infoAnnotations); + + if ( !infoAnnotations.isEmpty() ) { + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); + vc = builder.make(); + } + + return vc; + } + + private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType()); // add the ID if appropriate if ( rsID != null ) { @@ -273,7 +286,7 @@ public class VariantAnnotatorEngine { } } else { boolean overlapsComp = false; - for ( VariantContext comp : 
tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { + for ( VariantContext comp : tracker.getValues(dbSet.getKey(), loc) ) { if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) { overlapsComp = true; break; @@ -287,9 +300,9 @@ public class VariantAnnotatorEngine { return vc; } - private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { + private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getValues(expression.binding, ref.getLocus()); + Collection VCs = tracker.getValues(expression.binding, loc); if ( VCs.size() == 0 ) continue; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 1e4c55e0d..b10daab58 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -938,7 +938,7 @@ public class DepthOfCoverage extends LocusWalker SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. 
Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + throw new UserException.MisencodedBAM(p.getRead(), "we encountered an extremely high quality score (" + (int)qual + ")"); if ( capBaseQualsAtMappingQual ) qual = (byte)Math.min((int)qual, p.getMappingQual()); if ( (int)qual < minBaseQual ) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 439a9b3b8..26f2560b7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; +import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -57,6 +58,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -67,6 +69,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; 
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; @@ -142,6 +145,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here + * does not include uninformative reads so that not every input read is emitted to the bam. + */ + @Hidden + @Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false) + protected StingSAMFileWriter bamWriter = null; + private SAMFileHeader bamHeader = null; + private long uniqueNameCounter = 1; + private final static String readGroupId = "ArtificialHaplotype"; + /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ @@ -170,6 +184,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; + /** + * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling + * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the + * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking + * the mates contig and alignment start. 
If this flag is provided the haplotype caller will see such reads, + * and may make use of them in assembly and calling, where possible. + */ + @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) + protected boolean includeUnmappedReads = false; + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -242,6 +266,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // the genotyping engine private GenotypingEngine genotypingEngine = null; + private VariantAnnotatorEngine annotationEngine = null; + // fasta reference reader to supplement the edges of the reference sequence private CachingIndexedFastaSequenceFile referenceReader; @@ -286,7 +312,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header - final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet(); @@ -320,6 +346,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter, minKmer ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); + + if ( 
bamWriter != null ) + setupBamWriter(); } //--------------------------------------------------------------------------------------------------------------- @@ -335,11 +364,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // enable non primary and extended reads in the active region @Override public EnumSet desiredReadStates() { - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED - ); + if ( includeUnmappedReads ) { + throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional"); +// return EnumSet.of( +// ActiveRegionReadState.PRIMARY, +// ActiveRegionReadState.NONPRIMARY, +// ActiveRegionReadState.EXTENDED, +// ActiveRegionReadState.UNMAPPED +// ); + } else + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); } @Override @@ -408,7 +446,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- @Override - public Integer map( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { if ( justDetermineActiveRegions ) // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; @@ -429,8 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! 
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails - final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region - referenceHaplotype.setIsReference(true); + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype ); @@ -461,9 +498,31 @@ public class HaplotypeCaller extends ActiveRegionWalker implem activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { + final VariantContext annotatedCall = annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); /* annotateDBs returns the annotated context; write that one, not the original call */ - vcfWriter.add( call ); + vcfWriter.add( annotatedCall ); } + if ( bamWriter != null ) { + // write the haplotypes to the bam + final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); + for ( Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + + // next, output the interesting reads for each sample aligned against the appropriate haplotype + for ( final PerReadAlleleLikelihoodMap 
readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedRefLoc.getStart()); + } + } + } + if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } return 1; // One active region was processed during this map call @@ -557,6 +616,95 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return returnMap; } + private void setupBamWriter() { + // prepare the bam header + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + // include the original read groups plus a new artificial one for the haplotypes + final List readGroups = new ArrayList(getToolkit().getSAMFileHeader().getReadGroups()); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + + bamWriter.setPresorted(false); + bamWriter.writeHeader(bamHeader); + } + + private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(haplotype.getCigar()); + record.setMappingQuality(isAmongBestHaplotypes ? 
60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(paddedRefLoc.getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), readGroupId); + record.setFlags(16); + bamWriter.addAlignment(record); + } + + private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype, final int referenceStart) { + + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), read.getReadBases(), 5.0, -10.0, -22.0, -1.2); + final int readStartOnHaplotype = swPairwiseAlignment.getAlignmentStart2wrt1(); + final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; + read.setAlignmentStart(readStartOnReference); + + final Cigar cigar = generateReadCigarFromHaplotype(read, readStartOnHaplotype, haplotype.getCigar()); + read.setCigar(cigar); + + bamWriter.addAlignment(read); + } + + private Cigar generateReadCigarFromHaplotype(final GATKSAMRecord read, final int readStartOnHaplotype, final Cigar haplotypeCigar) { + + int currentReadPos = 0; + int currentHapPos = 0; + final List readCigarElements = new ArrayList(); + + for ( final CigarElement cigarElement : haplotypeCigar.getCigarElements() ) { + + if ( cigarElement.getOperator() == CigarOperator.D ) { + if ( currentReadPos > 0 ) + readCigarElements.add(cigarElement); + } else if ( cigarElement.getOperator() == CigarOperator.M || cigarElement.getOperator() == CigarOperator.I ) { + + final int elementLength = cigarElement.getLength(); + final int nextReadPos = currentReadPos + elementLength; + final int nextHapPos = currentHapPos + elementLength; + + // do we want this element? + if ( currentReadPos > 0 ) { + // do we want the entire element? 
+ if ( nextReadPos < read.getReadLength() ) { + readCigarElements.add(cigarElement); + currentReadPos = nextReadPos; + } + // otherwise, we can finish up and return the cigar + else { + readCigarElements.add(new CigarElement(read.getReadLength() - currentReadPos, cigarElement.getOperator())); + return new Cigar(readCigarElements); + } + } + // do we want part of the element to start? + else if ( currentReadPos == 0 && nextHapPos > readStartOnHaplotype ) { + currentReadPos = Math.min(nextHapPos - readStartOnHaplotype, read.getReadLength()); + readCigarElements.add(new CigarElement(currentReadPos, cigarElement.getOperator())); + } + + currentHapPos = nextHapPos; + } + } + + return new Cigar(readCigarElements); + } + /* private int determinePruneFactorFromCoverage( final ActiveRegion activeRegion ) { final ArrayList readLengthDistribution = new ArrayList(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 8b844817d..57e071189 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -124,9 +124,14 @@ public class LikelihoodCalculationEngine { } private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads) { + // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) + final int numHaplotypes = haplotypes.size(); + final Map alleleVersions = new HashMap(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + alleleVersions.put(haplotype, Allele.create(haplotype.getBases())); + } final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - final int 
numHaplotypes = haplotypes.size(); for( final GATKSAMRecord read : reads ) { final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? @@ -138,20 +143,17 @@ public class LikelihoodCalculationEngine { readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated + // TODO -- why is Q18 hard-coded here??? readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); } for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { final Haplotype haplotype = haplotypes.get(jjj); - // TODO -- need to test against a reference/position with non-standard bases - //if ( !Allele.acceptableAlleleBases(haplotype.getBases(), false) ) - // continue; - final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()), + perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index e1a94eee7..e16994fa4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -338,7 +338,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DefaultDirectedGraph graph : graphs ) { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { - final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); + final Haplotype h = new Haplotype( path.getBases( graph ) ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ) ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 9ba74b98a..8547c0993 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -376,7 +376,7 @@ public class ValidationAmplicons extends 
RodWalker { if ( lowerCaseSNPs ) { sequence.append(Character.toLowerCase((char) ref.getBase())); } else { - sequence.append((char) BaseUtils.N); + sequence.append((char) BaseUtils.Base.N.base); } rawSequence.append(Character.toUpperCase((char) ref.getBase())); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java index 81a17b6ae..38fa060cc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -57,6 +57,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -180,18 +182,47 @@ public class SelectHeaders extends RodWalker implements TreeRe headerLines = new LinkedHashSet(getSelectedHeaders(headerLines)); // Optionally add in the intervals. 
- if (includeIntervals && getToolkit().getArguments().intervals != null) { - for (IntervalBinding intervalBinding : getToolkit().getArguments().intervals) { - String source = intervalBinding.getSource(); - if (source == null) - continue; - File file = new File(source); - if (file.exists()) { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); - } else { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + if (includeIntervals) { + IntervalArgumentCollection intervalArguments = getToolkit().getArguments().intervalArguments; + if (intervalArguments.intervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.intervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + } } } + + if (intervalArguments.excludeIntervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.excludeIntervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, source)); + } + } + } + + if (intervalArguments.intervalMerging != IntervalMergingRule.ALL) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_MERGING_KEY, String.valueOf(intervalArguments.intervalMerging))); + } + + if (intervalArguments.intervalSetRule != IntervalSetRule.UNION) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_SET_RULE_KEY, String.valueOf(intervalArguments.intervalSetRule))); + } + + if (intervalArguments.intervalPadding != 0) { + 
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_PADDING_KEY, String.valueOf(intervalArguments.intervalPadding))); + } } TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a84019988..5b5a75d4e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -363,7 +363,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("39c7a813fd6ee82d3604f2a868b35b2a")); + Arrays.asList("8231ae37b52b927db9fc1e5c221b0ba0")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -391,13 +391,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("3d3c5691973a223209a1341272d881be")); + Arrays.asList("a47810de2f6ef8087f4644064a0814bc")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("23b7a37a64065cee53a80495c8717eea")); + Arrays.asList("53b8d2b0fa63c5d1019855e8e0db28f0")); executeTest("test MultiSample Pilot1 CEU indels using 
GENOTYPE_GIVEN_ALLELES", spec2); } @@ -497,18 +497,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("092e42a712afb660ec79ff11c55933e2")); + Arrays.asList("02175dc9731aed92837ce0db78489fc0")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); + testReducedCalling("SNP", "fe1af8b30b7f1a267f772b9aaf388f24"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "1c9aaf65ffaa12bb766855265a1c3f8e"); + testReducedCalling("INDEL", "a85c110fcac9574a54c7daccb1e2d5ae"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8f5e275e6..27fe31fa7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -50,6 +50,7 @@ import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; import java.util.Arrays; +import java.util.Collections; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; @@ -75,6 +76,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } + @Test(enabled = false) + public void testHaplotypeCallerSingleSampleWithDbsnp() { + HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); + } + @Test public void 
testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", @@ -151,6 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { executeTest("HCTestStructuralIndels: ", spec); } + @Test + public void HCTestDoesNotFailOnBadRefBase() { + // don't care about the output - just want to make sure it doesn't fail + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); + executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing reduced reads @@ -164,4 +178,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); executeTest("HC calling on a ReducedRead BAM", spec); } + + @Test + public void testReducedBamWithReadsNotFullySpanningDeletion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + Arrays.asList("4e8121dd9dc90478f237bd6ae4d19920")); + executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 28f128dd3..6db44efd5 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -111,8 +111,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData1() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); @@ -160,9 +160,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData2() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_T)); @@ -213,10 +213,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData3() { - Allele reference_ACT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C,BaseUtils.T},true); - Allele alt_AC = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C}); - Allele alt_A = Allele.create(BaseUtils.A); - Allele alt_ATT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.T,BaseUtils.T}); + Allele reference_ACT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base,BaseUtils.Base.T.base},true); + Allele alt_AC = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base}); + Allele 
alt_A = Allele.create(BaseUtils.Base.A.base); + Allele alt_ATT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.T.base,BaseUtils.Base.T.base}); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_ATT)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); @@ -267,9 +267,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData4() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); @@ -316,9 +316,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData5() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", new ArrayList(0)); @@ -368,8 +368,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private List> getData6() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); // site 1 - @@ -396,8 +396,8 @@ public class 
ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite1 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - reference_A = Allele.create(BaseUtils.A,true); - Allele alt_T = Allele.create(BaseUtils.T); + reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); // site 2 - // sample 1: no-call/hom-ref @@ -421,7 +421,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite2 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - Allele alt_G = Allele.create(BaseUtils.G); + Allele alt_G = Allele.create(BaseUtils.Base.G.base); // site 3 - // sample 1: alleles do not match @@ -605,10 +605,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { public List> getData7() { - Allele ref1 = Allele.create(BaseUtils.T,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.G); - Allele alt3 = Allele.create(BaseUtils.A); + Allele ref1 = Allele.create(BaseUtils.Base.T.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.G.base); + Allele alt3 = Allele.create(BaseUtils.Base.A.base); GenomeLoc loc1 = genomeLocParser.createGenomeLoc("chr1",1,1); VariantContextBuilder site1Eval = new VariantContextBuilder(); diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java new file mode 100644 index 000000000..3f76ae652 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, 
distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; + +import java.util.List; + +public class IntervalArgumentCollection { + /** + * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). + * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). + * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. + */ + @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. 
Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> intervals = null; + + /** + * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). + * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). + */ + @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> excludeIntervals = null; + + /** + * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions + * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). + */ + @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) + public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; + + /** + * Should abutting (but not overlapping) intervals be treated as separate intervals? + */ + @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) + public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; + + /** + * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. 
+ */ + @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) + public int intervalPadding = 0; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a5926aeae..9b801be7d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -55,7 +55,6 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -361,7 +360,6 @@ public class GenomeAnalysisEngine { * Returns a list of active, initialized read transformers * * @param walker the walker we need to apply read transformers too - * @return a non-null list of read transformers */ public void initializeReadTransformers(final Walker walker) { final List activeTransformers = new ArrayList(); @@ -672,41 +670,7 @@ public class GenomeAnalysisEngine { * Setup the intervals to be processed */ protected void initializeIntervals() { - // return if no interval arguments at all - if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) - return; - - // Note that the use of '-L all' is no longer supported. 
- - // if include argument isn't given, create new set of all possible intervals - - final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( - this.referenceDataSource, - argCollection.intervals, - argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, - argCollection.excludeIntervals); - - final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); - final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); - - // if no exclude arguments, can return parseIntervalArguments directly - if ( excludeSortedSet == null ) - intervals = includeSortedSet; - - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets - else { - intervals = includeSortedSet.subtractRegions(excludeSortedSet); - - // logging messages only printed when exclude (-XL) arguments are given - final long toPruneSize = includeSortedSet.coveredSize(); - final long toExcludeSize = excludeSortedSet.coveredSize(); - final long intervalSize = intervals.coveredSize(); - logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); - logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); - } - - logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); } /** @@ -842,7 +806,7 @@ public class GenomeAnalysisEngine { if (argCollection.keepProgramRecords) removeProgramRecords = false; - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker && argCollection.newART; + final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; return new SAMDataSource( samReaderIDs, diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b6f0d5f90..62ca38ad2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -26,11 +26,7 @@ package org.broadinstitute.sting.gatk.arguments; import net.sf.samtools.SAMFileReader; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.IntervalBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; @@ -38,8 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; import java.io.File; import java.util.ArrayList; @@ -100,41 +94,8 @@ public class GATKArgumentCollection { @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); - /** - * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). 
- * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). - * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. - */ - @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> intervals = null; - - /** - * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). - * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). - */ - @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> excludeIntervals = null; - - /** - * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions - * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). - */ - @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) - public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; - - /** - * Should abutting (but not overlapping) intervals be treated as separate intervals? 
- */ - @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) - public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - - /** - * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. - */ - @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) - public int intervalPadding = 0; + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; @@ -448,10 +409,5 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - @Hidden - @Argument(fullName="newART", shortName = "newART", doc = "use the new ART traversal", required=false) - public boolean newART = false; - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index 7d3cac33d..09b72f5eb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -77,13 +77,17 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView { * @return A tracker containing information about this locus. */ public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { - List bindings = states.isEmpty() ? 
Collections.emptyList() : new ArrayList(states.size()); + if ( states.isEmpty() ) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + List bindings = new ArrayList(states.size()); - for ( ReferenceOrderedDataState state: states ) - // todo -- warning, I removed the reference to the name from states - bindings.add( state.iterator.seekForward(loc) ); + for ( ReferenceOrderedDataState state: states ) + // todo -- warning, I removed the reference to the name from states + bindings.add( state.iterator.seekForward(loc) ); - return new RefMetaDataTracker(bindings); + return new RefMetaDataTracker(bindings); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java index 2c03363ba..5b4c2afda 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java @@ -95,7 +95,10 @@ public abstract class Shard implements HasGenomeLocation { */ private final Map fileSpans; - + /** + * Lazy-calculated span of all of the genome locs in this shard + */ + private GenomeLoc spanningLocation = null; /** * Statistics about which reads in this shards were used and which were filtered away. 
@@ -148,27 +151,34 @@ public abstract class Shard implements HasGenomeLocation { /** * Returns the span of the genomeLocs comprising this shard - * @param - * @return + * @return a GenomeLoc that starts as the first position in getGenomeLocs() and stops at the stop of the last + * position in getGenomeLocs() */ public GenomeLoc getLocation() { - if ( getGenomeLocs() == null ) - return GenomeLoc.WHOLE_GENOME; + if ( spanningLocation == null ) { + if ( getGenomeLocs() == null ) + spanningLocation = GenomeLoc.WHOLE_GENOME; + else if ( getGenomeLocs().size() == 0 ) { + spanningLocation = getGenomeLocs().get(0); + } else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; - int start = Integer.MAX_VALUE; - int stop = Integer.MIN_VALUE; - String contig = null; + for ( GenomeLoc loc : getGenomeLocs() ) { + if ( GenomeLoc.isUnmapped(loc) ) + // special case the unmapped region marker, just abort out + return loc; + contig = loc.getContig(); + if ( loc.getStart() < start ) start = loc.getStart(); + if ( loc.getStop() > stop ) stop = loc.getStop(); + } - for ( GenomeLoc loc : getGenomeLocs() ) { - if ( GenomeLoc.isUnmapped(loc) ) - // special case the unmapped region marker, just abort out - return loc; - contig = loc.getContig(); - if ( loc.getStart() < start ) start = loc.getStart(); - if ( loc.getStop() > stop ) stop = loc.getStop(); + spanningLocation = parser.createGenomeLoc(contig, start, stop); + } } - return parser.createGenomeLoc(contig, start, stop); + return spanningLocation; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java index 600834012..b06d5f5b4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -27,8 +27,8 @@ package 
org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMRecord; -import java.util.ArrayList; import java.util.Collection; +import java.util.LinkedList; import java.util.List; /** @@ -41,7 +41,7 @@ import java.util.List; */ public class PassThroughDownsampler implements ReadsDownsampler { - private ArrayList selectedReads; + private LinkedList selectedReads; public PassThroughDownsampler() { clear(); @@ -59,9 +59,13 @@ public class PassThroughDownsampler implements ReadsDownsam } public boolean hasFinalizedItems() { - return selectedReads.size() > 0; + return ! selectedReads.isEmpty(); } + /** + * Note that this list is a linked list and so doesn't support fast random access + * @return + */ public List consumeFinalizedItems() { // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; @@ -74,7 +78,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public T peekFinalized() { - return selectedReads.isEmpty() ? null : selectedReads.get(0); + return selectedReads.isEmpty() ? 
null : selectedReads.getFirst(); } public T peekPending() { @@ -90,7 +94,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public void clear() { - selectedReads = new ArrayList(); + selectedReads = new LinkedList(); } public void reset() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index 0d7a0dd14..4331fd723 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.util.*; /** * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with @@ -42,10 +40,25 @@ import java.util.List; * @author David Roazen */ public class ReservoirDownsampler implements ReadsDownsampler { + private final int targetSampleSize; - private ArrayList reservoir; + /** + * if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. 
+ */ + private final boolean expectFewOverflows; - private int targetSampleSize; + /** + * At times this can be a linked list or an array list, depending on how we're accessing the + * data and whether or not we're expecting few overflows + */ + private List reservoir; + + private boolean isLinkedList; private int totalReadsSeen; @@ -56,17 +69,35 @@ public class ReservoirDownsampler implements ReadsDownsampl * * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained * after downsampling will be min(totalReads, targetSampleSize) + * @param expectFewOverflows if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. */ - public ReservoirDownsampler ( int targetSampleSize ) { + public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) { if ( targetSampleSize <= 0 ) { throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); } this.targetSampleSize = targetSampleSize; + this.expectFewOverflows = expectFewOverflows; clear(); reset(); } + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. 
Number of items retained + * after downsampling will be min(totalReads, targetSampleSize) + */ + public ReservoirDownsampler ( int targetSampleSize ) { + this(targetSampleSize, false); + } + + public void submit ( T newRead ) { totalReadsSeen++; @@ -74,7 +105,12 @@ public class ReservoirDownsampler implements ReadsDownsampl reservoir.add(newRead); } else { - int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); + if ( isLinkedList ) { + reservoir = new ArrayList(reservoir); + isLinkedList = false; + } + + final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); if ( randomSlot < targetSampleSize ) { reservoir.set(randomSlot, newRead); } @@ -93,10 +129,15 @@ public class ReservoirDownsampler implements ReadsDownsampl } public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - List downsampledItems = reservoir; - clear(); - return downsampledItems; + if ( reservoir.isEmpty() ) { + // if there's nothing here, don't bother allocating a new list completely + return Collections.emptyList(); + } else { + // pass by reference rather than make a copy, for speed + List downsampledItems = reservoir; + clear(); + return downsampledItems; + } } public boolean hasPendingItems() { @@ -119,9 +160,18 @@ public class ReservoirDownsampler implements ReadsDownsampl // NO-OP } + /** + * Clear the data structures used to hold information + */ public void clear() { - reservoir = new ArrayList(targetSampleSize); - totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below + // if we aren't expecting many overflows, allocate a linked list not an arraylist + reservoir = expectFewOverflows ? 
new LinkedList() : new ArrayList(targetSampleSize); + + // it's a linked list if we allocate one + isLinkedList = expectFewOverflows; + + // an internal stat used by the downsampling process, so not cleared by reset() below + totalReadsSeen = 0; } public void reset() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c127899f6..371cce778 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -245,12 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else if (walker instanceof ReadPairWalker) { return new TraverseReadPairs(); } else if (walker instanceof ActiveRegionWalker) { - if ( engine.getArguments().newART ) { - // todo -- create optimized traversal - return new TraverseActiveRegionsOptimized(); - } else { - return new TraverseActiveRegionsOriginal(); - } + return new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 9cb38b840..5a1b015fe 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -61,6 +61,7 @@ public class RefMetaDataTracker { final Map bindings; final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); + public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); // ------------------------------------------------------------------------------------------ // diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index c7edebd81..a77341a5d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -194,17 +194,18 @@ public class VariantContextAdaptors { return null; // we weren't given enough reference context to create the VariantContext final byte refBaseForIndel = ref.getBases()[index]; + final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); boolean addPaddingBase; if ( isSNP(dbsnp) || isMNP(dbsnp) ) addPaddingBase = false; else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = VariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); + addPaddingBase = refBaseIsDash || VariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); else return null; // can't handle anything else Allele refAllele; - if ( dbsnp.getNCBIRefBase().equals("-") ) + if ( refBaseIsDash ) refAllele = Allele.create(refBaseForIndel, true); else if ( ! 
Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 45dbb6dc8..a7e4d7649 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -25,12 +25,14 @@ package org.broadinstitute.sting.gatk.traversals; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; @@ -43,36 +45,39 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * Created with IntelliJ IDEA. + * Implement active region traversal + * * User: depristo * Date: 1/9/13 * Time: 4:45 PM - * To change this template use File | Settings | File Templates. + * + * Live region: + * + * The ART tracks a thing called the live region. The live region is a position on a specific contig + * of the alignment start of the last read we processed during this traversal. Because the + * read stream is sorted, future reads must occur in the live region. 
Therefore the dead region + * (everything to the left of the live boundary) cannot have any more read data. The live / dead + * regions are used to decide when we can safely call map on active regions, as only active regions + * contained completely within the dead region (including extensions) have a complete set of read data + * in the collected read list. All of the data related to the live region is captured by the local + * variable spanOfLastReadSeen + * */ -public abstract class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); protected final static boolean DEBUG = false; // set by the tranversal private int activeRegionExtension = -1; private int maxRegionSize = -1; - /** - * our log, which we want to capture anything from this class - */ - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - protected final LinkedList workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); - abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker); - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. 
- * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public abstract T endTraversal(final Walker walker, T sum); + private LinkedList myReads = new LinkedList(); + private GenomeLoc spanOfLastReadSeen = null; protected int getActiveRegionExtension() { return activeRegionExtension; @@ -87,6 +92,11 @@ public abstract class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker, + final LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference location is within the interval + final GenomeLoc locOfLastReadAtTraversalStart = spanOfLastSeenRead(); + + // if we've moved onto a new contig, process all of the active regions + if ( onNewContig(dataProvider.getShard()) ) + sum = processActiveRegions(walker, sum, true); + + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); + for( final GATKSAMRecord read : reads ) { + if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { + if ( DEBUG ) 
logger.warn("Skipping duplicated " + read.getReadName()); + } else { + if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); + rememberLastReadLocation(read); + myReads.add(read); + } + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! 
profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, false); + + return sum; + } + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. 
+ * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal(final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); + } + + // ------------------------------------------------------------------------------------- + // + // Functions to manage and interact with the live / dead zone + // + // ------------------------------------------------------------------------------------- + + /** + * Update the live region to reflect that the last read we've seen in the traversal is read + * + * Requires that sequential calls always be provided reads in coordinate sorted order + * + * @param read the last read we've seen during the traversal + */ + protected void rememberLastReadLocation(final GATKSAMRecord read) { + final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isBefore(spanOfLastReadSeen) ) + throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); + spanOfLastReadSeen = currentLocation; + } + } + + /** + * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. + * @return the left-most position of the live region on the genome + */ + protected GenomeLoc spanOfLastSeenRead() { + return spanOfLastReadSeen; + } + + /** + * Is the active region completely within the traversal's dead zone? + * + * @param region the region we want to test + * @return true if the extended location of region is completely within the current dead zone, false otherwise + */ + protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { + return region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart() + || ! 
region.getExtendedLoc().onSameContig(spanOfLastSeenRead()); + } + + /** + * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? + * + * read: start |--------> stop ------ stop + extension + * region: start |-----------------| end + * + * Since the regions are coming in order, read could potentially be contained in a future interval if + * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end + * of this region, then we can discard it, since any future region could only include reads + * up to end + 1 - extension. + * + * Note that this function doesn't care about the dead zone. We're assuming that by + * actually calling this function with an active region that region is already in the dead zone, + * so checking that the read is in the dead zone doesn't make sense. + * + * @param read the read we're testing + * @param activeRegion the current active region + * @return true if the read is dead, false otherwise + */ + @Requires({"read != null", "activeRegion != null"}) + private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { + return read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop(); + } + + // ------------------------------------------------------------------------------------- + // + // Functions to process active regions that are ready for map / reduce calls + // + // ------------------------------------------------------------------------------------- + + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); + } + } + + private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + 
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final ActiveRegion activeRegion = workQueue.peek(); + if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { + if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead()); + sum = processActiveRegion( workQueue.remove(), sum, walker ); + } else { + break; + } + } + + return sum; + } + + protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { + final Iterator liveReads = myReads.iterator(); + while ( liveReads.hasNext() ) { + boolean killed = false; + final GATKSAMRecord read = liveReads.next(); + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + activeRegion.add(read); + + if ( ! walker.wantsNonPrimaryReads() ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + spanOfLastSeenRead()); + liveReads.remove(); + killed = true; + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + + if ( ! killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + spanOfLastSeenRead()); + liveReads.remove(); + } + } + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? 
"active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java deleted file mode 100644 index 809c7ea6a..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOptimized extends TraverseActiveRegions { - private LinkedList myReads = new LinkedList(); - private Shard lastShard = null; - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final HashSet maybeDuplicatedReads = new HashSet(); - // TODO -- there's got to be a better way to know this - if ( lastShard != dataProvider.getShard() ) { - maybeDuplicatedReads.addAll(myReads); - logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); - if ( DEBUG ) logger.warn("Clearing myReads"); - } - lastShard = dataProvider.getShard(); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final List activeRegions = new LinkedList(); - ActivityProfile profile = new 
ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); - for( final GATKSAMRecord read : reads ) { - notifyOfCurrentPosition(read); - // most of the time maybeDuplicatedReads is empty - // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the - // TODO -- potential list of duplicates we can clear the hashset - if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { - if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); - } else { - if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); - myReads.add((GATKSAMRecord)read); - } - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. 
Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." 
); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, false); - - return sum; - } - - private GenomeLoc startOfLiveRegion = null; - - protected void notifyOfCurrentPosition(final GATKSAMRecord read) { - notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read)); - } - - protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { - if ( startOfLiveRegion == null ) - startOfLiveRegion = currentLocation; - else - startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); - } - - protected GenomeLoc getStartOfLiveRegion() { - return startOfLiveRegion; - } - - protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { - return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) - || ! region.onSameContig(getStartOfLiveRegion()); - } - - private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); - } - } - - private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { - final ActiveRegion activeRegion = workQueue.remove(); - if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - public String toString() { - return "TraverseActiveRegionsOptimized"; - } - - private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) { - return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true); - } - - @Override - protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { - final Iterator liveReads = myReads.iterator(); - while ( liveReads.hasNext() ) { - boolean killed = false; - final GATKSAMRecord read = liveReads.next(); - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - activeRegion.add(read); - - if ( ! walker.wantsNonPrimaryReads() ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - killed = true; - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - - if ( ! 
killed && readIsDead(read, readLoc, activeRegion) ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - } - } - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); - } - - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - @Override - public T endTraversal(final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java deleted file mode 100644 index 0786bc800..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOriginal extends TraverseActiveRegions { - private final LinkedHashSet myReads = new LinkedHashSet(); - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); - final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - - int minStart = Integer.MAX_VALUE; - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); - } - - // If this is the last pileup for this shard calculate the minimum alignment 
start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! 
profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); - - return sum; - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - * - * @param profile - * @param activeRegions - * @param activeRegionExtension - * @param maxRegionSize - * @return - */ - private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, - final List activeRegions, - final int activeRegionExtension, - final int maxRegionSize) { - if ( profile.isEmpty() ) - throw new IllegalStateException("trying to incorporate an empty active profile " + profile); - - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); - return new ActivityProfile( engine.getGenomeLocParser(), 
profile.hasPresetRegions() ); - } - - // -------------------------------------------------------------------------------- - // - // code to handle processing active regions - // - // -------------------------------------------------------------------------------- - - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); - } - } - - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; - for( final ActiveRegion otherRegionToTest : workQueue ) { 
- if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); - bestRegion = otherRegionToTest; - } - } - bestRegion.add( read ); - - // The read is also added to all other regions in which it overlaps but marked as non-primary - if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); - } - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( !bestRegion.equals(otherRegionToTest) ) { - // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } - } - } - } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map( activeRegion, null ); - return walker.reduce( x, sum ); - } - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. 
- * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker) walker, sum, Integer.MAX_VALUE, null); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index 573291d06..fe2eee2a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -217,9 +217,9 @@ public class CoverageUtils { private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); - } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); + counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount(); + } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { + counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount(); } else { try { counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java index 9cd1be2d9..668d3fd5f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java @@ -86,7 +86,7 @@ public class GCContentByInterval extends LocusWalker { if (tracker == null) return null; int baseIndex = ref.getBaseIndex(); - return (baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex) ? 
1L : 0L; + return (baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal()) ? 1L : 0L; } public Long reduce(Long toAdd, Long runningCount) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index efe9460cb..baab1f5fa 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -37,12 +37,10 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; -public class Haplotype { - protected final byte[] bases; - protected final double[] quals; +public class Haplotype extends Allele { + private GenomeLoc genomeLocation = null; private HashMap eventMap = null; - private boolean isRef = false; private Cigar cigar; private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; @@ -50,44 +48,37 @@ public class Haplotype { private Event artificialEvent = null; /** - * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual + * Main constructor * * @param bases bases - * @param qual qual + * @param isRef is reference allele? 
*/ - public Haplotype( final byte[] bases, final int qual ) { - this.bases = bases.clone(); - quals = new double[bases.length]; - Arrays.fill(quals, (double)qual); - } - - public Haplotype( final byte[] bases, final double[] quals ) { - this.bases = bases.clone(); - this.quals = quals.clone(); + public Haplotype( final byte[] bases, final boolean isRef ) { + super(bases.clone(), isRef); } public Haplotype( final byte[] bases ) { - this(bases, 0); + this(bases, false); } protected Haplotype( final byte[] bases, final Event artificialEvent ) { - this(bases, 0); + this(bases, false); this.artificialEvent = artificialEvent; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { - this(bases); + this(bases, false); this.genomeLocation = loc; } @Override public boolean equals( Object h ) { - return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases); + return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); } - + @Override public int hashCode() { - return Arrays.hashCode(bases); + return Arrays.hashCode(getBases()); } public HashMap getEventMap() { @@ -98,32 +89,13 @@ public class Haplotype { this.eventMap = eventMap; } - public boolean isReference() { - return isRef; - } - - public void setIsReference( boolean isRef ) { - this.isRef = isRef; - } - - public double getQualitySum() { - double s = 0; - for (int k=0; k < bases.length; k++) { - s += quals[k]; - } - return s; - } - @Override public String toString() { - return new String(bases); + return getDisplayString(); } - public double[] getQuals() { - return quals.clone(); - } public byte[] getBases() { - return bases.clone(); + return super.getBases().clone(); } public long getStartPosition() { @@ -178,19 +150,23 @@ public class Haplotype { public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic 
coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); - if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype + if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= getBases().length ) { // desired change falls inside deletion so don't bother creating a new haplotype return null; } byte[] newHaplotypeBases = new byte[]{}; - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), haplotypeInsertLocation + refAllele.length(), getBases().length)); // bases after the variant return new Haplotype(newHaplotypeBases, new Event(refAllele, altAllele, genomicInsertLocation)); } public static class HaplotypeBaseComparator implements Comparator, Serializable { @Override public int compare( final Haplotype hap1, final Haplotype hap2 ) { + return compareHaplotypeBases(hap1, hap2); + } + + public static int compareHaplotypeBases(final Haplotype hap1, final Haplotype hap2) { final byte[] arr1 = hap1.getBases(); final byte[] arr2 = hap2.getBases(); // compares byte arrays using lexical ordering diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java 
b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 524c29d64..87526545d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -381,9 +381,9 @@ public class ReadClipper { * @return a new read without adaptor sequence */ private GATKSAMRecord hardClipAdaptorSequence () { - final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read); + final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read); - if (adaptorBoundary == null || !ReadUtils.isInsideRead(read, adaptorBoundary)) + if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY || !ReadUtils.isInsideRead(read, adaptorBoundary)) return read; return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 3d43d5d4d..a749625cd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -33,6 +33,7 @@ import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.utils.BaseUtils; import java.io.File; import java.io.FileNotFoundException; @@ -41,9 +42,10 @@ import java.util.Arrays; /** * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * - * Thread-safe! Uses a thread-local cache + * Thread-safe! Uses a thread-local cache. 
* - * Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set + * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set. + * Automatically converts IUPAC bases to Ns, unless the flag preserveIUPAC is explicitly set. */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -64,10 +66,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { private final long cacheMissBackup; /** - * If true, we will preserve the case of the original base in the genome, not + * If true, we will preserve the case of the original base in the genome */ private final boolean preserveCase; + /** + * If true, we will preserve the IUPAC bases in the genome + */ + private final boolean preserveIUPAC; + // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; @@ -97,13 +104,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param index the index of the fasta file, used for efficient random access * @param cacheSize the size in bp of the cache we will use for this reader * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; 
this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; } /** @@ -116,25 +125,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException { + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException { super(fasta); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; } -// /** -// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. -// * -// * @param fasta The file to open. -// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. -// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. 
-// */ -// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { -// this(fasta, index, DEFAULT_CACHE_SIZE); -// } - /** * Same as general constructor but allows one to override the default cacheSize * @@ -145,7 +144,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size in bp of the cache we will use for this reader */ public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { - this(fasta, index, cacheSize, false); + this(fasta, index, cacheSize, false, false); } /** @@ -169,7 +168,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { - this(fasta, DEFAULT_CACHE_SIZE, preserveCase); + this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false); } /** @@ -182,7 +181,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 */ public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { - this(fasta, cacheSize, false); + this(fasta, cacheSize, false, false); } /** @@ -240,6 +239,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return ! isPreservingCase(); } + /** + * Is this CachingIndexedFastaReader keeping the IUPAC bases in the fasta, or is it turning them into Ns? 
+ * + * @return true if the IUPAC bases coming from this reader are not modified + */ + public boolean isPreservingIUPAC() { + return preserveIUPAC; + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -253,7 +261,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * all of the bases in the ReferenceSequence returned by this method will be upper cased. */ @Override - public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { + public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) { final ReferenceSequence result; final Cache myCache = cache.get(); @@ -261,8 +269,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1); } else { - // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true + // todo -- potential optimization is to check if contig.name == contig, as this in general will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); if (stop > contigInfo.getSequenceLength()) @@ -276,6 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { // convert all of the bases in the sequence to upper case if we aren't preserving cases if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); + if ( ! 
preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0); } else { cacheHits++; } diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index c647a7b80..7374dda14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -32,6 +32,7 @@ import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.IntervalArgumentCollection; import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; @@ -534,6 +535,47 @@ public class IntervalUtils { } } + public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) { + GenomeLocSortedSet intervals = null; + + // return if no interval arguments at all + if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) + return intervals; + + // Note that the use of '-L all' is no longer supported. 
+ + // if include argument isn't given, create new set of all possible intervals + + final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + referenceDataSource, + argCollection.intervals, + argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, + argCollection.excludeIntervals); + + final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + + // if no exclude arguments, can return parseIntervalArguments directly + if ( excludeSortedSet == null ) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + intervals = includeSortedSet.subtractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + final long toPruneSize = includeSortedSet.coveredSize(); + final long toExcludeSize = excludeSortedSet.coveredSize(); + final long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); + logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + + logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + return intervals; + } + public static Pair parseIntervalBindingsPair( final ReferenceDataSource referenceDataSource, final List> intervals, diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java index 32e56866b..50bc9e25b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java @@ -113,6 +113,16 @@ public class AlignmentStateMachine { return read; } + /** + * Get the reference index of the underlying read + * + * @return the reference index of the read + */ + @Ensures("result == getRead().getReferenceIndex()") + public int getReferenceIndex() { + return getRead().getReferenceIndex(); + } + /** * Is this the left edge state? I.e., one that is before or after the current read? * @return true if this state is an edge state, false otherwise diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java index 0985ed196..2d074f420 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java @@ -63,6 +63,8 @@ public class LIBSPerformance extends CommandLineProgram { @Argument(fullName = "L", shortName = "L", doc = "Query location", required = false) public String location = null; + @Argument(fullName = "dt", shortName = "dt", doc = "Enable downsampling", required = false) + public boolean downsample = false; @Override public int execute() throws IOException { @@ -86,7 +88,7 @@ public class LIBSPerformance extends CommandLineProgram { for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() ) samples.add(rg.getSample()); - final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(false, -1); + final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(downsample, 250); final LocusIteratorByState libs = new LocusIteratorByState( diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index 01c9e564e..e7b75f1f2 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -34,8 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -66,7 +65,7 @@ import java.util.*; * occurs, if requested. This allows users of LIBS to see both a ReadBackedPileup view of the data as well as * a stream of unique, sorted reads */ -public class LocusIteratorByState extends LocusIterator { +public final class LocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ @@ -234,17 +233,16 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - // TODO: How can you determine here whether the current pileup has been downsampled? 
- boolean hasBeenSampled = false; - - for (final String sample : samples) { - final Iterator iterator = readStates.iterator(sample); - final List pile = new ArrayList(readStates.size(sample)); + for (final Map.Entry sampleStatePair : readStates ) { + final String sample = sampleStatePair.getKey(); + final PerSampleReadStateManager readState = sampleStatePair.getValue(); + final Iterator iterator = readState.iterator(); + final List pile = new ArrayList(readState.size()); while (iterator.hasNext()) { // state object with the read/offset information final AlignmentStateMachine state = iterator.next(); - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); + final GATKSAMRecord read = state.getRead(); final CigarOperator op = state.getCigarOperator(); if (op == CigarOperator.N) // N's are never added to any pileup @@ -263,29 +261,9 @@ public class LocusIteratorByState extends LocusIterator { fullPileup.put(sample, new ReadBackedPileupImpl(location, pile)); } - updateReadStates(); // critical - must be called after we get the current state offsets and location + readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done - nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); - } - } - - /** - * Advances all fo the read states by one bp. After this call the read states are reflective - * of the next pileup. - */ - private void updateReadStates() { - for (final String sample : samples) { - Iterator it = readStates.iterator(sample); - while (it.hasNext()) { - AlignmentStateMachine state = it.next(); - CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. 
Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java new file mode 100644 index 000000000..3f3bc706f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.locusiterator; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import net.sf.samtools.CigarOperator; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * ReadStateManager for a single sample + * + * User: depristo + * Date: 1/13/13 + * Time: 12:28 PM + */ +@Invariant({ + "readStartsAreWellOrdered()", + "! isDownsampling() || downsamplingTarget > 0", + "nSites >= 0", + "nSitesNeedingDownsampling >= 0", + "nSitesNeedingDownsampling <= nSites" +}) +final class PerSampleReadStateManager implements Iterable { + private final static Logger logger = Logger.getLogger(ReadStateManager.class); + private final static boolean CAPTURE_DOWNSAMPLING_STATS = false; + + /** + * A list (potentially empty) of alignment state machines. + * + * The state machines must be ordered by the alignment start of their underlying reads, with the + * lowest alignment starts on the left, and the largest on the right + */ + private LinkedList readStatesByAlignmentStart = new LinkedList(); + + private final Downsampler> levelingDownsampler; + private final int downsamplingTarget; + + /** + * The number of sites where downsampling has been invoked + */ + private int nSitesNeedingDownsampling = 0; + + /** + * The number of sites we've visited + */ + private int nSites = 0; + + /** + * Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the downsampling params we want to use + */ + public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? 
LIBSDownsamplingInfo.getToCoverage() : -1; + this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() + ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) + : null; + } + + /** + * Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines, + * where each list contains machines with a unique genome site. The outer list is ordered + * by alignment start. + * + * For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then + * the resulting grouping will be [[10, 10], [11], [12, 12], [13]]. + * + * @return a non-null list of lists + */ + @Ensures("result != null") + private List> groupByAlignmentStart() { + final LinkedList> grouped = new LinkedList>(); + + AlignmentStateMachine last = null; + for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) { + if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) { + // we've advanced to a place where the state machine has a different state, + // so start a new list + grouped.add(new LinkedList()); + last = stateMachine; + } + grouped.getLast().add(stateMachine); + } + + return grouped; + } + + /** + * Flattens the grouped list of list of alignment state machines into a single list in order + * @return a non-null list contains the state machines + */ + @Ensures("result != null") + private LinkedList flattenByAlignmentStart(final List> grouped) { + final LinkedList flat = new LinkedList(); + for ( final List l : grouped ) + flat.addAll(l); + return flat; + } + + /** + * Test that the reads are ordered by their alignment starts + * @return true if well ordered, false otherwise + */ + private boolean readStartsAreWellOrdered() { + int lastStart = -1; + for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) { + if ( lastStart > machine.getRead().getAlignmentStart() ) + return false; + lastStart = machine.getRead().getAlignmentStart(); + } + return true; + } 
+ + /** + * Assumes it can just keep the states linked lists without making a copy + * @param states the new states to add to this manager + * @return The change in the number of states, after including states and potentially downsampling. Note + * that this return result might be negative, if downsampling is enabled, as we might drop + * more sites than have been added by the downsampler + */ + @Requires("states != null") + public int addStatesAtNextAlignmentStart(final LinkedList states) { + if ( states.isEmpty() ) { + return 0; + } + + readStatesByAlignmentStart.addAll(states); + int nStatesAdded = states.size(); + + if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) { + // only go into the downsampling branch if we are downsampling and the coverage > the target + captureDownsamplingStats(); + levelingDownsampler.submit(groupByAlignmentStart()); + levelingDownsampler.signalEndOfInput(); + + nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); + levelingDownsampler.reset(); + } + + return nStatesAdded; + } + + /** + * Is downsampling enabled for this manager? + * @return true if we are downsampling, false otherwise + */ + private boolean isDownsampling() { + return levelingDownsampler != null; + } + + /** + * Get the leftmost alignment state machine, or null if the read states is empty + * @return a potentially null AlignmentStateMachine + */ + public AlignmentStateMachine getFirst() { + return isEmpty() ? 
null : readStatesByAlignmentStart.getFirst(); + } + + /** + * Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true + */ + @Requires("isDownsampling()") + private void captureDownsamplingStats() { + if ( CAPTURE_DOWNSAMPLING_STATS ) { + nSites++; + final int loc = getFirst().getGenomePosition(); + String message = "Pass through"; + final boolean downsampling = size() > downsamplingTarget; + if ( downsampling ) { + nSitesNeedingDownsampling++; + message = "Downsampling"; + } + + if ( downsampling || nSites % 10000 == 0 ) + logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", + message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + } + } + + /** + * Is this manager empty, i.e., does it hold no alignments for this sample? + * @return true if there are no alignments in this manager, false otherwise + */ + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + /** + * Get the number of read states currently in this manager + * @return the number of read states + */ + @Ensures("result >= 0") + public int size() { + return readStatesByAlignmentStart.size(); + } + + /** + * Advances all read states forward by one element, removing states that are + * no longer aligned to the current position. + * @return the number of states we've removed after advancing + */ + public int updateReadStates() { + int nRemoved = 0; + final Iterator it = iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
+ it.remove(); // we've stepped off the end of the object + nRemoved++; + } + } + + return nRemoved; + } + + /** + * Iterate over the AlignmentStateMachine in this manager in alignment start order. + * @return a valid iterator + */ + @Ensures("result != null") + public Iterator iterator() { + return readStatesByAlignmentStart.iterator(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 2dcf01d72..09ec3b264 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,8 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -48,11 +47,18 @@ import java.util.*; * Date: 1/5/13 * Time: 2:02 PM */ -class ReadStateManager { +final class ReadStateManager implements Iterable> { private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + + /** + * A mapping from sample name -> the per sample read state manager that manages the read states for that sample + * + * IT IS CRITICAL THAT THIS BE A LINKED HASH MAP, SO THAT THE ITERATION OF THE MAP OCCURS IN THE SAME + * ORDER AS THE ORIGINAL SAMPLES + */ + private final Map readStatesBySample = new LinkedHashMap(); private LinkedList submittedReads; private final boolean keepSubmittedReads; @@ -70,6 +76,7 @@ class ReadStateManager { this.submittedReads = new LinkedList(); for (final String sample :
samples) { + // because this is a linked hash map the order of iteration will be in sample order readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo)); } @@ -77,29 +84,16 @@ class ReadStateManager { } /** - * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented - * for this iterator; if present, total read states will be decremented. + * Returns an iterator over all the sample -> per-sample read state manager pairs, one for each sample in this read state manager. * - * @param sample The sample. - * @return Iterator over the reads associated with that sample. + * The order of iteration is the same as the order of the samples provided upon construction to this + * ReadStateManager. + * + * @return Iterator over sample + per sample read state manager pairs for this read state manager. */ - public Iterator iterator(final String sample) { - // TODO -- why is this wrapped? - return new Iterator() { - private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public AlignmentStateMachine next() { - return wrappedIterator.next(); - } - - public void remove() { - wrappedIterator.remove(); - } - }; + @Override + public Iterator> iterator() { + return readStatesBySample.entrySet().iterator(); } public boolean isEmpty() { @@ -126,10 +120,9 @@ class ReadStateManager { } public AlignmentStateMachine getFirst() { - for (final String sample : samples) { - PerSampleReadStateManager reads = readStatesBySample.get(sample); - if (!reads.isEmpty()) - return reads.peek(); + for ( final PerSampleReadStateManager manager : readStatesBySample.values() ) { + if ( !
manager.isEmpty() ) + return manager.getFirst(); } return null; } @@ -138,55 +131,69 @@ class ReadStateManager { return totalReadStates > 0 || iterator.hasNext(); } - // fast testing of position - /** - * TODO -- this function needs to be optimized - * - * Notes: - * -- the only place where it's called is in a block where we know isEmpty is false - * -- getFirst() is quite expensive, and it seems that we could cache this value in the outer - * block, and then pass this in as an argument - * - * @param read - * @return + * Advances all of the read states by one bp. After this call the read states are reflective + * of the next pileup. */ - private boolean readIsPastCurrentPosition(GATKSAMRecord read) { - if (isEmpty()) - return false; - else { - final AlignmentStateMachine state = getFirst(); - final GATKSAMRecord ourRead = state.getRead(); - return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + public void updateReadStates() { + for (final PerSampleReadStateManager perSampleReadStateManager : readStatesBySample.values() ) { + totalReadStates -= perSampleReadStateManager.updateReadStates(); } } + /** + * Does read start at the same position as described by currentContigIndex and currentAlignmentStart?
+ * + * @param read the read we want to test + * @param currentContigIndex the contig index (from the read's getReferenceIndex) of the reads in this state manager + * @param currentAlignmentStart the alignment start of the left-most position on the + * genome of the reads in this read state manager + * @return true if read has contig index and start equal to the current ones + */ + private boolean readStartsAtCurrentPosition(final GATKSAMRecord read, final int currentContigIndex, final int currentAlignmentStart) { + return read.getAlignmentStart() == currentAlignmentStart && read.getReferenceIndex() == currentContigIndex; + } + + /** + * Pull all of the reads off the iterator that overlap the left-most position among all + * reads in this ReadStateManager + */ public void collectPendingReads() { if (!iterator.hasNext()) return; - // the next record in the stream, peeked as to not remove it from the stream + // determine the left-most boundary that determines which reads to keep in this new pileup + final int firstContigIndex; + final int firstAlignmentStart; if ( isEmpty() ) { - final int firstContigIndex = iterator.peek().getReferenceIndex(); - final int firstAlignmentStart = iterator.peek().getAlignmentStart(); - while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { - submitRead(iterator.next()); - } + // there are no reads here, so our next state is the next read in the stream + firstContigIndex = iterator.peek().getReferenceIndex(); + firstAlignmentStart = iterator.peek().getAlignmentStart(); } else { - // Fast fail in the case that the read is past the current position.
- if (readIsPastCurrentPosition(iterator.peek())) - return; + // there's a read in the system, so it's our targeted first read + final AlignmentStateMachine firstState = getFirst(); + firstContigIndex = firstState.getReferenceIndex(); + // note this isn't the alignment start of the read, but rather the alignment start position + firstAlignmentStart = firstState.getGenomePosition(); + } - while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - submitRead(iterator.next()); - } + while ( iterator.hasNext() && readStartsAtCurrentPosition(iterator.peek(), firstContigIndex, firstAlignmentStart) ) { + submitRead(iterator.next()); } samplePartitioner.doneSubmittingReads(); for (final String sample : samples) { final Collection newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + + // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample + // and add to the list of reads. 
Note this may reorder the list of reads somewhat (it groups them + // by sample), but it cannot change their absolute position on the genome as they all must + // start at the current location + if ( keepSubmittedReads ) + submittedReads.addAll(newReads); + + final PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); addReadsToSample(statesBySample, newReads); } @@ -199,8 +206,6 @@ class ReadStateManager { */ @Requires("read != null") protected void submitRead(final GATKSAMRecord read) { - if ( keepSubmittedReads ) - submittedReads.add(read); samplePartitioner.submitRead(read); } @@ -271,94 +276,15 @@ class ReadStateManager { if (reads.isEmpty()) return; - Collection newReadStates = new LinkedList(); + final LinkedList newReadStates = new LinkedList(); - for (GATKSAMRecord read : reads) { - AlignmentStateMachine state = new AlignmentStateMachine(read); - if ( state.stepForwardOnGenome() != null ) + for (final GATKSAMRecord read : reads) { + final AlignmentStateMachine state = new AlignmentStateMachine(read); + if ( state.stepForwardOnGenome() != null ) // todo -- should be an assertion not a skip // explicitly filter out reads that are all insertions / soft clips newReadStates.add(state); } - readStates.addStatesAtNextAlignmentStart(newReadStates); - } - - protected class PerSampleReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private final Downsampler> levelingDownsampler; - - private int thisSampleReadStates = 0; - - public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { - this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() - ?
new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) - : null; - } - - public void addStatesAtNextAlignmentStart(Collection states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(new LinkedList(states)); - thisSampleReadStates += states.size(); - totalReadStates += states.size(); - - if ( levelingDownsampler != null ) { - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } - } - - public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); - } - - public AlignmentStateMachine peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); - } - - public int size() { - return thisSampleReadStates; - } - - public Iterator iterator() { - return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; - - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - public AlignmentStateMachine next() { - if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; - } + totalReadStates += readStates.addStatesAtNextAlignmentStart(newReadStates); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java index 1653c6a92..9bb474e4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.locusiterator; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.PassThroughDownsampler; @@ -33,49 +35,138 @@ import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import java.util.*; /** - * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * Divides reads by sample and (if requested) does a preliminary downsampling pass + * with a ReservoirDownsampler. 
* * Note: stores reads by sample ID string, not by sample object */ class SamplePartitioner { - private Map> readsBySample; + /** + * Map from sample name (as a string) to a downsampler of reads for that sample + */ + final private Map> readsBySample; + /** + * Are we in a state where we're done submitting reads and have semi-finalized the + * underlying per sample downsampler? + */ + boolean doneSubmittingReads = false; + + /** + * Create a new SamplePartitioner capable of splitting reads up into buckets of reads for + * each sample in samples, and perform a preliminary downsampling of these reads + * (separately for each sample) if downsampling is requested in LIBSDownsamplingInfo + * + * Note that samples must be comprehensive, in that all reads ever submitted to this + * partitioner must come from one of the samples provided here. If not, submitRead + * will throw an exception. Duplicates in the list of samples will be ignored + * + * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? + * @param samples the complete list of samples we're going to partition reads into. Can be + * empty, but in that case this code cannot function properly if you + * attempt to add data to it.
+ */ + @Ensures({ + "readsBySample != null", + "readsBySample.size() == new HashSet(samples).size()" + }) public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { - readsBySample = new HashMap>(samples.size()); - for ( String sample : samples ) { + if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); + if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list"); + + readsBySample = new LinkedHashMap>(samples.size()); + for ( final String sample : samples ) { readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo)); } } + /** + * Create a new, ready to use downsampler based on the parameters in LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the parameters to use in creating the downsampler + * @return a downsampler appropriate for LIBSDownsamplingInfo. If no downsampling is requested, + * uses the PassThroughDownsampler, which does nothing at all. + */ + @Requires("LIBSDownsamplingInfo != null") + @Ensures("result != null") private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { return LIBSDownsamplingInfo.isPerformDownsampling() - ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage()) + ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage(), true) : new PassThroughDownsampler(); } - public void submitRead(T read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submit(read); + /** + * Offer this read to the partitioner, putting it into the bucket of reads for the sample + * of read (obtained via the read's read group). 
+ * + * If the read group is missing, uses the special "null" read group + * + * @throws IllegalStateException if the sample of read wasn't present in the original + * set of samples provided to this SamplePartitioner at construction + * + * @param read the read to add to the sample's list of reads + */ + @Requires("read != null") + @Ensures("doneSubmittingReads == false") + public void submitRead(final T read) { + final String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) + throw new IllegalStateException("Offered read with sample name " + sampleName + " to SamplePartitioner " + + "but this sample wasn't provided as one of possible samples at construction"); + + downsampler.submit(read); + doneSubmittingReads = false; } + /** + * Tell this partitioner that all reads in this cycle have been submitted, so that we + * can finalize whatever downsampling is required by each sample. + * + * Note that we *must* call this function before getReadsForSample, or else that + * function will exception out. + */ + @Ensures("doneSubmittingReads == true") public void doneSubmittingReads() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().signalEndOfInput(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.signalEndOfInput(); } + doneSubmittingReads = true; } - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); + /** + * Get the final collection of reads for this sample for this cycle + * + * The cycle is defined as all of the reads that occur between + * the first call to submitRead until doneSubmittingReads is called. At that + * point additional downsampling may occur (depending on construction arguments) + * and that set of reads is returned here. 
+ * + * Note that this function can only be called once per cycle, as underlying + * collection of reads is cleared. + * + * @param sampleName the sample we want reads for, must be present in the original samples + * @return a non-null collection of reads for sample in this cycle + */ + @Ensures("result != null") + public Collection getReadsForSample(final String sampleName) { + if ( ! doneSubmittingReads ) throw new IllegalStateException("getReadsForSample called before doneSubmittingReads was called"); - return readsBySample.get(sampleName).consumeFinalizedItems(); + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) throw new NoSuchElementException("Sample name not found"); + + return downsampler.consumeFinalizedItems(); } + /** + * Resets this SamplePartitioner, indicating that we're starting a new + * cycle of adding reads to each underlying downsampler. + */ + @Ensures("doneSubmittingReads == false") public void reset() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().clear(); - perSampleReads.getValue().reset(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.clear(); + downsampler.reset(); } + doneSubmittingReads = false; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index c0e18f227..5a5358208 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import 
org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -52,7 +51,7 @@ public class PileupElement implements Comparable { private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); - public static final byte DELETION_BASE = BaseUtils.D; + public static final byte DELETION_BASE = BaseUtils.Base.D.base; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; public static final byte C_FOLLOWED_BY_INSERTION_BASE = (byte) 88; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 0907a0239..b7a813ec2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -402,13 +401,13 @@ public class AlignmentUtils { switch (ce.getOperator()) { case I: if (alignPos > 0) { - if (alignment[alignPos - 1] == BaseUtils.A) { + if (alignment[alignPos - 1] == BaseUtils.Base.A.base) { alignment[alignPos - 1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.C) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.C.base) { alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.T) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.T.base) { alignment[alignPos - 1] = 
PileupElement.T_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.G) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.G.base) { alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java new file mode 100644 index 000000000..ab539c9dc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.NGSPlatform; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * Easy to use creator of artificial BAM files for testing + * + * Allows us to make a stream of reads or an index BAM file with read having the following properties + * + * - coming from n samples + * - of fixed read length and aligned to the genome with M operator + * - having N reads per alignment start + * - skipping N bases between each alignment start + * - starting at a given alignment start + * + * User: depristo + * Date: 1/15/13 + * Time: 9:22 AM + */ +public class ArtificialBAMBuilder { + public final static int BAM_SHARD_SIZE = 16384; + + private final IndexedFastaSequenceFile reference; + private final GenomeLocParser parser; + + final int nReadsPerLocus; + final int nLoci; + + int skipNLoci = 0; + int alignmentStart = 1; + int readLength = 10; + private final ArrayList samples = new ArrayList(); + + private LinkedList additionalReads = new LinkedList(); + + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + { + factory.setCreateIndex(true); + } + + SAMFileHeader header; + + public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + + this.reference = reference; + this.parser = new GenomeLocParser(reference); + createAndSetHeader(1); + } + + public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + this(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000).getSequenceDictionary(), nReadsPerLocus, nLoci); + } + + public ArtificialBAMBuilder(final SAMSequenceDictionary dict, int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + this.reference = null; + 
this.parser = new GenomeLocParser(dict); + createAndSetHeader(1); + } + + public IndexedFastaSequenceFile getReference() { + return reference; + } + + public GenomeLocParser getGenomeLocParser() { + return parser; + } + + public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { + this.header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(parser.getContigs()); + samples.clear(); + + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + return this; + } + + public void addReads(final GATKSAMRecord readToAdd) { + additionalReads.add(readToAdd); + } + + public void addReads(final Collection readsToAdd) { + additionalReads.addAll(readsToAdd); + } + + public List getSamples() { + return samples; + } + + /** + * Create a read stream based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like LocusIteratorBystate + * + * @return a ordered list of reads + */ + public List makeReads() { + final String baseName = "read"; + List reads = new ArrayList(nReadsPerLocus*nLoci); + for ( int locusI = 0; locusI < nLoci; locusI++) { + final int locus = locusI * (skipNLoci + 1); + for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { + for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { + final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + reads.add(read); + } + } + } + + if ( ! 
additionalReads.isEmpty() ) { + reads.addAll(additionalReads); + Collections.sort(reads, new SAMRecordCoordinateComparator()); + } + + return reads; + } + + /** + * Make an indexed BAM file contains the reads in the builder, marking it for deleteOnExit() + * @return the BAM file + */ + public File makeTemporarilyBAMFile() { + try { + final File file = File.createTempFile("tempBAM", ".bam"); + file.deleteOnExit(); + return makeBAMFile(file); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + /** + * Write the reads from this builder to output, creating an index as well + * @param output the output BAM file we want to use + * @return + */ + public File makeBAMFile(final File output) { + final SAMFileWriter writer = factory.makeBAMWriter(header, true, output, 0); + for ( final GATKSAMRecord read : makeReads() ) + writer.addAlignment(read); + writer.close(); + return output; + } + + public int getnReadsPerLocus() { return nReadsPerLocus; } + public int getnLoci() { return nLoci; } + public int getSkipNLoci() { return skipNLoci; } + public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; return this; } + public int getAlignmentStart() { return alignmentStart; } + public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; return this; } + public int getReadLength() { return readLength; } + public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; return this; } + public SAMFileHeader getHeader() { return header; } + public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + + public int getAlignmentEnd() { + return alignmentStart + nLoci * (skipNLoci + 1) + readLength; + } + + + public int getNSamples() { return samples.size(); } + + public int expectedNumberOfReads() { + return nLoci * nReadsPerLocus * header.getReadGroups().size(); + } + + @Override + public String toString() { + return 
"ArtificialBAMBuilder{" + + "samples=" + samples + + ", readLength=" + readLength + + ", alignmentStart=" + alignmentStart + + ", skipNLoci=" + skipNLoci + + ", nLoci=" + nLoci + + ", nReadsPerLocus=" + nReadsPerLocus + + '}'; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 4af6555d9..0f5d6a2f7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -327,35 +327,6 @@ public class ArtificialSAMUtils { return stack; } - /** - * Create a read stream based on the parameters. The cigar string for each - * read will be *M, where * is the length of the read. - * - * Useful for testing things like LocusIteratorBystate - * - * @return a collection of stackSize reads all sharing the above properties - */ - public static List createReadStream( final int nReadsPerLocus, - final int nLoci, - final SAMFileHeader header, - final int alignmentStart, - final int length ) { - final String baseName = "read"; - List reads = new ArrayList(nReadsPerLocus*nLoci); - for ( int locus = 0; locus < nLoci; locus++ ) { - for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { - for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { - final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); - final GATKSAMRecord read = createArtificialRead(header, readName, 0, alignmentStart + locus, length); - read.setReadGroup(new GATKSAMReadGroupRecord(rg)); - reads.add(read); - } - } - } - - return reads; - } - /** * create an iterator containing the specified read piles * diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index b61628d4d..1488f7269 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -169,8 +169,8 @@ public class ReadUtils { * @return whether or not the base is in the adaptor */ public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { - Integer adaptorBoundary = getAdaptorBoundary(read); - if (adaptorBoundary == null || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + final int adaptorBoundary = getAdaptorBoundary(read); + if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) return false; return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; @@ -199,26 +199,28 @@ public class ReadUtils { * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) * * @param read the read being tested for the adaptor boundary - * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig. + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. + * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. 
*/ - public static Integer getAdaptorBoundary(final SAMRecord read) { + public static int getAdaptorBoundary(final SAMRecord read) { final int MAXIMUM_ADAPTOR_LENGTH = 8; final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or unmapped pairs - return null; + return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; - Integer adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) + int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) if (read.getReadNegativeStrandFlag()) adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) else adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header) if ( (adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH) ) - adaptorBoundary = null; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor + adaptorBoundary = CANNOT_COMPUTE_ADAPTOR_BOUNDARY; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor return adaptorBoundary; } + public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; /** * is the read a 454 read? 
@@ -392,6 +394,11 @@ public class ReadUtils { return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); } + public static int getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) { + final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord); + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false); + } + public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); int readCoord = result.getFirst(); diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index 819041a3e..a6ac2ca53 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -26,6 +26,7 @@ package org.broadinstitute.variant.utils; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.Random; @@ -34,42 +35,66 @@ import java.util.Random; * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { - public final static byte A = (byte) 'A'; - public final static byte C = (byte) 'C'; - public final static byte G = (byte) 'G'; - public final static byte T = (byte) 'T'; - public final static byte N = (byte) 'N'; - public final static byte D = (byte) 'D'; + public enum Base { + A ((byte)'A'), + C ((byte)'C'), + G ((byte)'G'), + T ((byte)'T'), + N ((byte)'N'), + D ((byte)'D'); - // - // todo -- we need a generalized base abstraction using the Base enum. 
- // + public byte base; + + private Base(final byte base) { + this.base = base; + } + } + + // todo -- add this to the generalized base abstraction using the Base enum. public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); - baseIndexMap['A'] = 0; - baseIndexMap['a'] = 0; - baseIndexMap['*'] = 0; // the wildcard character counts as an A - baseIndexMap['C'] = 1; - baseIndexMap['c'] = 1; - baseIndexMap['G'] = 2; - baseIndexMap['g'] = 2; - baseIndexMap['T'] = 3; - baseIndexMap['t'] = 3; + baseIndexMap['A'] = Base.A.ordinal(); + baseIndexMap['a'] = Base.A.ordinal(); + baseIndexMap['*'] = Base.A.ordinal(); // the wildcard character counts as an A + baseIndexMap['C'] = Base.C.ordinal(); + baseIndexMap['c'] = Base.C.ordinal(); + baseIndexMap['G'] = Base.G.ordinal(); + baseIndexMap['g'] = Base.G.ordinal(); + baseIndexMap['T'] = Base.T.ordinal(); + baseIndexMap['t'] = Base.T.ordinal(); } - // todo -- fix me (enums?) 
- public static final byte DELETION_INDEX = 4; - public static final byte NO_CALL_INDEX = 5; // (this is 'N') - - public static final int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); - public static final int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C'); - public static final int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G'); - public static final int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); + static private final int[] baseIndexWithIupacMap = baseIndexMap.clone(); + static { + baseIndexWithIupacMap['*'] = -1; // the wildcard character is bad + baseIndexWithIupacMap['N'] = Base.N.ordinal(); + baseIndexWithIupacMap['n'] = Base.N.ordinal(); + baseIndexWithIupacMap['R'] = Base.N.ordinal(); + baseIndexWithIupacMap['r'] = Base.N.ordinal(); + baseIndexWithIupacMap['Y'] = Base.N.ordinal(); + baseIndexWithIupacMap['y'] = Base.N.ordinal(); + baseIndexWithIupacMap['M'] = Base.N.ordinal(); + baseIndexWithIupacMap['m'] = Base.N.ordinal(); + baseIndexWithIupacMap['K'] = Base.N.ordinal(); + baseIndexWithIupacMap['k'] = Base.N.ordinal(); + baseIndexWithIupacMap['W'] = Base.N.ordinal(); + baseIndexWithIupacMap['w'] = Base.N.ordinal(); + baseIndexWithIupacMap['S'] = Base.N.ordinal(); + baseIndexWithIupacMap['s'] = Base.N.ordinal(); + baseIndexWithIupacMap['B'] = Base.N.ordinal(); + baseIndexWithIupacMap['b'] = Base.N.ordinal(); + baseIndexWithIupacMap['D'] = Base.N.ordinal(); + baseIndexWithIupacMap['d'] = Base.N.ordinal(); + baseIndexWithIupacMap['H'] = Base.N.ordinal(); + baseIndexWithIupacMap['h'] = Base.N.ordinal(); + baseIndexWithIupacMap['V'] = Base.N.ordinal(); + baseIndexWithIupacMap['v'] = Base.N.ordinal(); + } // Use a fixed random seed to allow for deterministic results when using random bases private static final Random randomNumberGen = new Random(47382911L); @@ -96,10 +121,10 @@ public class BaseUtils { } public static boolean isTransition(byte base1, byte base2) { - int b1 = simpleBaseToBaseIndex(base1); - int b2 = 
simpleBaseToBaseIndex(base2); - return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + final int b1 = simpleBaseToBaseIndex(base1); + final int b2 = simpleBaseToBaseIndex(base2); + return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() || + b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal(); } public static boolean isTransversion(byte base1, byte base2) { @@ -141,6 +166,21 @@ public class BaseUtils { return base >= 'A' && base <= 'Z'; } + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) { + final int length = bases.length; + final int start = ignoreConversionOfFirstByte ? 1 : 0; + + for ( int i = start; i < length; i++ ) { + final int baseIndex = baseIndexWithIupacMap[bases[i]]; + if ( baseIndex == Base.N.ordinal() ) { + bases[i] = 'N'; + } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { + throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + } + } + return bases; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -231,10 +271,10 @@ public class BaseUtils { switch (base) { case 'd': case 'D': - return DELETION_INDEX; + return Base.D.ordinal(); case 'n': case 'N': - return NO_CALL_INDEX; + return Base.N.ordinal(); default: return simpleBaseToBaseIndex(base); diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java index 33bca1a8a..0a0b4d0b7 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java @@ -111,7 +111,7 @@ public class Allele implements Comparable { /** A generic static NO_CALL allele for use */ // no public way to create an 
allele - private Allele(byte[] bases, boolean isRef) { + protected Allele(byte[] bases, boolean isRef) { // null alleles are no longer allowed if ( wouldBeNullAllele(bases) ) { throw new IllegalArgumentException("Null alleles are not supported"); @@ -140,7 +140,7 @@ public class Allele implements Comparable { throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); } - private Allele(String bases, boolean isRef) { + protected Allele(String bases, boolean isRef) { this(bases.getBytes(), isRef); } diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java index 583a01417..9bdb86a48 100644 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java @@ -73,6 +73,10 @@ public class VCFHeader { public static final String REFERENCE_KEY = "reference"; public static final String CONTIG_KEY = "contig"; public static final String INTERVALS_KEY = "intervals"; + public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals"; + public static final String INTERVAL_MERGING_KEY = "interval_merging"; + public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule"; + public static final String INTERVAL_PADDING_KEY = "interval_padding"; // were the input samples sorted originally (or are we sorting them)? 
private boolean samplesWereAlreadySorted = true; diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java new file mode 100644 index 000000000..bc1e1d7b0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; + +import java.util.*; + +/** + * ActiveRegionWalker for unit testing + * + * User: depristo + * Date: 1/15/13 + * Time: 1:28 PM + */ +class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + private EnumSet states = super.desiredReadStates(); + private GenomeLocSortedSet activeRegions = null; + + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new LinkedHashMap(); + + public DummyActiveRegionWalker() { + this(1.0); + } + + public DummyActiveRegionWalker(double constProb) { + this.prob = constProb; + } + + public DummyActiveRegionWalker(EnumSet wantStates) { + this(1.0); + this.states = wantStates; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions) { + this(1.0); + this.activeRegions = activeRegions; + } + + public void setStates(EnumSet states) { + this.states = states; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? 
prob : 0.0; + return new ActivityProfileResult(ref.getLocus(), p); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java deleted file mode 100644 index 35a0931df..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.traversals; - -import com.google.java.contract.PreconditionError; -import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - - -/** - 
* Created with IntelliJ IDEA. - * User: depristo - * Date: 1/10/13 - * Time: 8:03 PM - * To change this template use File | Settings | File Templates. - */ -public class TraverseActiveRegionsOriginalUnitTest extends BaseTest { - - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - - private final TraverseActiveRegions t = new TraverseActiveRegionsOriginal(); - - private IndexedFastaSequenceFile reference; - private SAMSequenceDictionary dictionary; - private GenomeLocParser genomeLocParser; - - private List intervals; - - private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; - private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; - - @BeforeClass - private void init() throws FileNotFoundException { - reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); - dictionary = reference.getSequenceDictionary(); - genomeLocParser = new 
GenomeLocParser(dictionary); - - // TODO: reads with indels - // TODO: reads which span many regions - // TODO: reads which are partially between intervals (in/outside extension) - // TODO: duplicate reads - // TODO: read at the end of a contig - // TODO: reads which are completely outside intervals but within extension - // TODO: test the extension itself - // TODO: unmapped reads - - intervals = new ArrayList(); - intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); - intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); - - List reads = new ArrayList(); - reads.add(buildSAMRecord("simple", "1", 100, 200)); - reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); - reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); - reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); - reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); - reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); - reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); - reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); - reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); - reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); - reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); - reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); - reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); - - createBAM(reads); - } - - private void createBAM(List reads) { - File outFile = new File(testBAM); - 
outFile.deleteOnExit(); - File indexFile = new File(testBAI); - indexFile.deleteOnExit(); - - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); - for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { - out.addAlignment(read); - } - out.close(); - } - - @Test - public void testAllBasesSeen() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - List activeIntervals = getIsActiveIntervals(walker, intervals); - // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call - verifyEqualIntervals(intervals, activeIntervals); - } - - private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { - List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) { - t.traverse(walker, dataProvider, 0); - activeIntervals.addAll(walker.isActiveCalls); - } - - return activeIntervals; - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeLow () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); - getActiveRegions(walker, intervals).values(); - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeHigh () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); - getActiveRegions(walker, intervals).values(); - } - - @Test - public void testActiveRegionCoverage() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - verifyActiveRegionCoverage(intervals, activeRegions); - } - - private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { - List intervalStarts = new ArrayList(); - List intervalStops = new ArrayList(); - - for (GenomeLoc interval : intervals) { - intervalStarts.add(interval.getStartLocation()); - 
intervalStops.add(interval.getStopLocation()); - } - - Map baseRegionMap = new HashMap(); - - for (ActiveRegion activeRegion : activeRegions) { - for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { - // Contract: Regions do not overlap - Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); - baseRegionMap.put(activeLoc, activeRegion); - } - - GenomeLoc start = activeRegion.getLocation().getStartLocation(); - if (intervalStarts.contains(start)) - intervalStarts.remove(start); - - GenomeLoc stop = activeRegion.getLocation().getStopLocation(); - if (intervalStops.contains(stop)) - intervalStops.remove(stop); - } - - for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { - // Contract: Each location in the interval(s) is in exactly one region - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); - baseRegionMap.remove(baseLoc); - } - - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); - - // Contract: All explicit interval boundaries must also be region boundaries - Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); - Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); - } - - @Test - public void testActiveRegionExtensionOnContig() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - for (ActiveRegion activeRegion : activeRegions) { - GenomeLoc loc = activeRegion.getExtendedLoc(); - - // Contract: active region extensions must stay on the contig - 
Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); - int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); - Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); - } - } - - @Test - public void testPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", 
"boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testNonPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = 
activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testExtendedReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // 
shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testUnmappedReads() { - // TODO - } - - private void verifyReadMapping(ActiveRegion region, String... 
reads) { - Collection wantReads = new ArrayList(Arrays.asList(reads)); - for (SAMRecord read : region.getReads()) { - String regionReadName = read.getReadName(); - Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region); - wantReads.remove(regionReadName); - } - - Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region); - } - - private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) - t.traverse(walker, dataProvider, 0); - - t.endTraversal(walker, 0); - - return walker.mappedActiveRegions; - } - - private Collection toSingleBaseLocs(GenomeLoc interval) { - List bases = new ArrayList(); - if (interval.size() == 1) - bases.add(interval); - else { - for (int location = interval.getStart(); location <= interval.getStop(); location++) - bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); - } - - return bases; - } - - private Collection toSingleBaseLocs(List intervals) { - Set bases = new TreeSet(); // for sorting and uniqueness - for (GenomeLoc interval : intervals) - bases.addAll(toSingleBaseLocs(interval)); - - return bases; - } - - private void verifyEqualIntervals(List aIntervals, List bIntervals) { - Collection aBases = toSingleBaseLocs(aIntervals); - Collection bBases = toSingleBaseLocs(bIntervals); - - Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); - - Iterator aIter = aBases.iterator(); - Iterator bIter = bBases.iterator(); - while (aIter.hasNext() && bIter.hasNext()) { - GenomeLoc aLoc = aIter.next(); - GenomeLoc bLoc = bIter.next(); - Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. 
" + bLoc); - } - } - - // copied from LocusViewTemplate - protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { - SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); - header.setSequenceDictionary(dictionary); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - GATKSAMRecord record = new GATKSAMRecord(header); - - record.setReadName(readName); - record.setReferenceIndex(dictionary.getSequenceIndex(contig)); - record.setAlignmentStart(alignmentStart); - - Cigar cigar = new Cigar(); - int len = alignmentEnd - alignmentStart + 1; - cigar.add(new CigarElement(len, CigarOperator.M)); - record.setCigar(cigar); - record.setReadString(new String(new char[len]).replace("\0", "A")); - record.setBaseQualities(new byte[len]); - - return record; - } - - private List createDataProviders(final Walker walker, List intervals, String bamFile) { - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); - - Collection samFiles = new ArrayList(); - SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); - samFiles.add(readerID); - - SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); - - List providers = new ArrayList(); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } - } - - return providers; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 70% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 038cd2853..319af5ec5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -30,33 +30,26 @@ import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.*; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import 
org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -76,62 +69,14 @@ import java.util.*; * Test the Active Region Traversal Contract * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ -public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { +public class TraverseActiveRegionsUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer 
map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); - traversals.add(new Object[]{new TraverseActiveRegionsOptimized()}); + traversals.add(new Object[]{new TraverseActiveRegions()}); return traversals.toArray(new Object[][]{}); } @@ -297,7 +242,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { } } - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + @Test(enabled = true, dataProvider = "TraversalEngineProvider") public void testPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); @@ -340,7 +285,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { verifyReadMapping(region, "simple20"); } - @Test(enabled = true, dataProvider = "TraversalEngineProvider") + @Test(enabled = true && ! 
DEBUG, dataProvider = "TraversalEngineProvider") public void testNonPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker( EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); @@ -456,7 +401,11 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { } private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) + return getActiveRegions(t, walker, intervals, testBAM); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final String bam) { + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) t.traverse(walker, dataProvider, 0); t.endTraversal(walker, 0); @@ -516,14 +465,15 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { record.setCigar(cigar); record.setReadString(new String(new char[len]).replace("\0", "A")); record.setBaseQualities(new byte[len]); + record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test"))); return record; } - private List createDataProviders(TraverseActiveRegions t, final Walker walker, List intervals, String bamFile) { + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); + traverseActiveRegions.initialize(engine, walker); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); @@ -537,15 +487,201 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, t instanceof TraverseActiveRegionsOptimized); + false, 
(byte)30, false, true); + + final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); List providers = new ArrayList(); for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } } return providers; } + + // --------------------------------------------------------------------------------------------------------- + // + // Combinatorial tests to ensure reads are going into the right regions + // + // --------------------------------------------------------------------------------------------------------- + + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + final List> allReadStates = Arrays.asList( + EnumSet.of(ActiveRegionReadState.PRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED) + ); + + final int maxTests = Integer.MAX_VALUE; + int nTests = 0; + for ( final int readLength : Arrays.asList(10, 100) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { 
+ for ( final int nLoci : Arrays.asList(1, 1000) ) { + for ( EnumSet readStates : allReadStates ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + + for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) { + nTests++; + if ( nTests < maxTests ) // && nTests == 1238 ) + tests.add(new Object[]{nTests, activeRegions, readStates, bamBuilder}); + } + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Collection enumerateActiveRegions(final int start, final int stop) { + // should basically cut up entire region into equal sized chunks, of + // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive + // Need to make sure we include some edge cases: + final List activeRegions = new LinkedList(); + + for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) { + for ( final boolean startWithActive : Arrays.asList(true, false) ) { + activeRegions.add(makeActiveRegionMask(start, stop, stepSize, startWithActive)); + } + } + + // active region is the whole interval + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop))); + + // active region extends up to the end of the data, but doesn't include start + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop))); + + return activeRegions; + } + + private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) { + final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser); + + boolean includeRegion = startWithActive; + for ( int left = start; left < stop; left += stepSize) { + final int right = left + stepSize; + final GenomeLoc region = 
genomeLocParser.createGenomeLoc("1", left, right); + if ( includeRegion ) + active.add(region); + includeRegion = ! includeRegion; + } + + return active; + } + + + @Test(enabled = true && ! DEBUG, dataProvider = "CombinatorialARTTilingProvider") + public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { + logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + walker.setStates(readStates); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary + for ( final ActiveRegion region : activeRegionsMap.values() ) { + final Set readNamesInRegion = readNamesInRegion(region); + int nReadsExpectedInRegion = 0; + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + + boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) + ? region.getExtendedLoc().overlapsP(readLoc) + : region.getLocation().overlapsP(readLoc); + + if ( ! 
readStates.contains(ActiveRegionReadState.NONPRIMARY) ) { + if ( alreadySeenReads.contains(read.getReadName()) ) + shouldBeInRegion = false; + else if ( shouldBeInRegion ) + alreadySeenReads.add(read.getReadName()); + } + + Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, "Region " + region + + " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"); + + nReadsExpectedInRegion += shouldBeInRegion ? 1 : 0; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } + } + + private Set readNamesInRegion(final ActiveRegion region) { + final Set readNames = new LinkedHashSet(region.getReads().size()); + for ( final SAMRecord read : region.getReads() ) + readNames.add(read.getReadName()); + return readNames; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Make sure all insertion reads are properly included in the active regions + // + // --------------------------------------------------------------------------------------------------------- + + @Test + public void ensureAllInsertionReadsAreInActiveRegions() { + + final int readLength = 10; + final int start = 20; + final int nReadsPerLocus = 10; + final int nLoci = 3; + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setAlignmentStart(start); + + // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength); + allI.setCigarString(readLength + "I"); + allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0))); + + 
bamBuilder.addReads(allI); + + final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser()); + activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30)); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final ActiveRegion region = activeRegionsMap.values().iterator().next(); + int nReadsExpectedInRegion = 0; + + final Set readNamesInRegion = readNamesInRegion(region); + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + Assert.assertTrue(readNamesInRegion.contains(read.getReadName()), + "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't"); + nReadsExpectedInRegion++; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index c67e52f2e..0c1b5b069 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -32,8 +32,10 @@ package org.broadinstitute.sting.utils.fasta; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import org.apache.commons.lang.StringUtils; import org.apache.log4j.Priority; import 
org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -49,7 +51,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; /** - * Basic unit test for GenomeLoc + * Basic unit test for CachingIndexedFastaSequenceFile */ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta"); @@ -80,7 +82,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", @@ -122,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "fastas", enabled = true && ! 
DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -167,7 +169,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000) public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); for ( int iterations = 0; iterations < 1; iterations++ ) { @@ -230,4 +232,33 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { else return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); } + + @Test(enabled = true) + public void testIupacChanges() throws FileNotFoundException, InterruptedException { + final String testFasta = privateTestDir + "iupacFASTA.fasta"; + final CachingIndexedFastaSequenceFile iupacPreserving = new CachingIndexedFastaSequenceFile(new File(testFasta), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true); + final CachingIndexedFastaSequenceFile makeNs = new CachingIndexedFastaSequenceFile(new File(testFasta)); + + 
int preservingNs = 0; + int changingNs = 0; + for ( SAMSequenceRecord contig : iupacPreserving.getSequenceDictionary().getSequences() ) { + final String sPreserving = fetchBaseString(iupacPreserving, contig.getSequenceName(), 0, 15000); + preservingNs += StringUtils.countMatches(sPreserving, "N"); + + final String sChanging = fetchBaseString(makeNs, contig.getSequenceName(), 0, 15000); + changingNs += StringUtils.countMatches(sChanging, "N"); + } + + Assert.assertEquals(changingNs, preservingNs + 4); + } + + @Test(enabled = true, expectedExceptions = {UserException.class}) + public void testFailOnBadBase() throws FileNotFoundException, InterruptedException { + final String testFasta = privateTestDir + "problematicFASTA.fasta"; + final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta)); + + for ( SAMSequenceRecord contig : fasta.getSequenceDictionary().getSequences() ) { + fetchBaseString(fasta, contig.getSequenceName(), -1, -1); + } + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 35f9d4137..2be2745de 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1068,7 +1068,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } 
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") @@ -1081,7 +1081,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 47e386ab5..e5e28e1f6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.utils.locusiterator; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; @@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import 
org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -53,6 +54,32 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { private static final boolean DEBUG = false; protected LocusIteratorByState li; + @Test(enabled = true) + public void testUnmappedAndAllIReadsPassThrough() { + final int readLength = 10; + GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength); + GATKSAMRecord mapped2 = ArtificialSAMUtils.createArtificialRead(header,"mapped2",0,1,readLength); + GATKSAMRecord unmapped = ArtificialSAMUtils.createArtificialRead(header,"unmapped",0,1,readLength); + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(header,"allI",0,1,readLength); + + unmapped.setReadUnmappedFlag(true); + unmapped.setCigarString("*"); + allI.setCigarString(readLength + "I"); + + List reads = Arrays.asList(mapped1, unmapped, allI, mapped2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,createTestReadProperties(DownsamplingMethod.NONE, true)); + + Assert.assertTrue(li.hasNext()); + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 2, "Should see only 2 reads in pileup, even with unmapped and all I reads"); + + final List rawReads = li.transferReadsFromAllPreviousPileups(); + Assert.assertEquals(rawReads, reads, "Input and transferred read lists should be the same, and include the unmapped and all I reads"); + } + @Test(enabled = true && ! DEBUG) public void testXandEQOperators() { final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -350,7 +377,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { // Arrays.asList(3)); } - @Test(enabled = true, dataProvider = "LIBSTest") + @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBSTest") public void testLIBS(LIBSTest params) { // create the iterator by state with the fake reads and fake records final GATKSAMRecord read = params.makeRead(); @@ -406,22 +433,25 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { // // ------------------------------------------------------------ - @DataProvider(name = "LIBSKeepSubmittedReads") - public Object[][] makeLIBSKeepSubmittedReads() { + @DataProvider(name = "LIBS_ComplexPileupTests") + public Object[][] makeLIBS_ComplexPileupTests() { final List tests = new LinkedList(); - for ( final boolean doSampling : Arrays.asList(true, false) ) { - for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) { + for ( final int downsampleTo : Arrays.asList(-1, 1, 2, 5, 10, 30)) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10, 60) ) { for ( final int nLoci : Arrays.asList(1, 10, 25) ) { for ( final int nSamples : Arrays.asList(1, 2, 10) ) { for ( final boolean keepReads : Arrays.asList(true, false) ) { for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true, false) ) { -// for ( final int nReadsPerLocus : Arrays.asList(1) ) { -// for ( final int nLoci : Arrays.asList(1) ) { -// for ( final int nSamples : Arrays.asList(1) ) { -// for ( final boolean keepReads : Arrays.asList(true) ) { -// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { - tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, doSampling}); +// for ( final int downsampleTo : Arrays.asList(1)) { +// for ( final int nReadsPerLocus : Arrays.asList(1) ) { +// for ( final int nLoci : Arrays.asList(1) ) { +// for ( final int nSamples : Arrays.asList(1) ) { +// for ( final boolean keepReads : Arrays.asList(true) ) { +// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { + tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, + keepReads, grabReadsAfterEachCycle, + downsampleTo}); } } } @@ -432,37 +462,29 @@ 
public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { return tests.toArray(new Object[][]{}); } - @Test(enabled = true && ! DEBUG, dataProvider = "LIBSKeepSubmittedReads") - public void testLIBSKeepSubmittedReads(final int nReadsPerLocus, - final int nLoci, - final int nSamples, - final boolean keepReads, - final boolean grabReadsAfterEachCycle, - final boolean downsample) { - logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); + @Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") + public void testLIBS_ComplexPileupTests(final int nReadsPerLocus, + final int nLoci, + final int nSamples, + final boolean keepReads, + final boolean grabReadsAfterEachCycle, + final int downsampleTo) { + //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); final int readLength = 10; - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); - final List samples = new ArrayList(nSamples); - for ( int i = 0; i < nSamples; i++ ) { - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); - final String sample = "sample" + i; - samples.add(sample); - rg.setSample(sample); - rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); - header.addReadGroup(rg); - } - - final int maxCoveragePerSampleAtLocus = nReadsPerLocus * readLength / 2; - final int maxDownsampledCoverage = Math.max(maxCoveragePerSampleAtLocus / 2, 1); + final boolean downsample = downsampleTo != -1; final DownsamplingMethod downsampler = downsample - ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, maxDownsampledCoverage, null, false) + ? 
new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); - final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(header.getSequenceDictionary(), nReadsPerLocus, nLoci); + bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); + + final List reads = bamBuilder.makeReads(); li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), createTestReadProperties(downsampler, keepReads), genomeLocParser, - samples); + bamBuilder.getSamples()); final Set seenSoFar = new HashSet(); final Set keptReads = new HashSet(); @@ -472,6 +494,8 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { final AlignmentContext alignmentContext = li.next(); final ReadBackedPileup p = alignmentContext.getBasePileup(); + AssertWellOrderedPileup(p); + if ( downsample ) { // just not a safe test //Assert.assertTrue(p.getNumberOfElements() <= maxDownsampledCoverage * nSamples, "Too many reads at locus after downsampling"); @@ -480,22 +504,29 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { Assert.assertTrue(p.getNumberOfElements() >= minPileupSize); } + // the number of reads starting here + int nReadsStartingHere = 0; + for ( final GATKSAMRecord read : p.getReads() ) + if ( read.getAlignmentStart() == alignmentContext.getPosition() ) + nReadsStartingHere++; + + // we can have no more than maxDownsampledCoverage per sample + final int maxCoveragePerLocus = downsample ? 
downsampleTo : nReadsPerLocus; + Assert.assertTrue(nReadsStartingHere <= maxCoveragePerLocus * nSamples); + seenSoFar.addAll(p.getReads()); if ( keepReads && grabReadsAfterEachCycle ) { final List locusReads = li.transferReadsFromAllPreviousPileups(); - // the number of reads starting here - int nReadsStartingHere = 0; - for ( final GATKSAMRecord read : p.getReads() ) - if ( read.getAlignmentStart() == alignmentContext.getPosition() ) - nReadsStartingHere++; - if ( downsample ) + if ( downsample ) { // with downsampling we might have some reads here that were downsampled away - // in the pileup + // in the pileup. We want to ensure that no more than the max coverage per sample is added Assert.assertTrue(locusReads.size() >= nReadsStartingHere); - else + Assert.assertTrue(locusReads.size() <= maxCoveragePerLocus * nSamples); + } else { Assert.assertEquals(locusReads.size(), nReadsStartingHere); + } keptReads.addAll(locusReads); // check that all reads we've seen so far are in our keptReads @@ -543,6 +574,116 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { for ( final GATKSAMRecord read : seenSoFar ) { Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read); } + + if ( ! downsample ) { + // check that every read in the list of keep reads occurred at least once in one of the pileups + for ( final GATKSAMRecord keptRead : keptReads ) { + Assert.assertTrue(seenSoFar.contains(keptRead), "There's a read " + keptRead + " in our keptReads list that never appeared in any pileup"); + } + } + } + } + + private void AssertWellOrderedPileup(final ReadBackedPileup pileup) { + if ( ! 
pileup.isEmpty() ) { + int leftMostPos = -1; + + for ( final PileupElement pe : pileup ) { + Assert.assertTrue(pileup.getLocation().getContig().equals(pe.getRead().getReferenceName()), "ReadBackedPileup contains an element " + pe + " that's on a different contig than the pileup itself"); + Assert.assertTrue(pe.getRead().getAlignmentStart() >= leftMostPos, + "ReadBackedPileup contains an element " + pe + " whose read's alignment start " + pe.getRead().getAlignmentStart() + + " occurs before the leftmost position we've seen previously " + leftMostPos); + } + } + } + + // --------------------------------------------------------------------------- + // make sure that downsampling isn't holding onto a bazillion reads + // + @DataProvider(name = "LIBS_NotHoldingTooManyReads") + public Object[][] makeLIBS_NotHoldingTooManyReads() { + final List tests = new LinkedList(); + + for ( final int downsampleTo : Arrays.asList(1, 10)) { + for ( final int nReadsPerLocus : Arrays.asList(100, 1000, 10000, 100000) ) { + for ( final int payloadInBytes : Arrays.asList(0, 1024, 1024*1024) ) { + tests.add(new Object[]{nReadsPerLocus, downsampleTo, payloadInBytes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBS_NotHoldingTooManyReads") +// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000) + public void testLIBS_NotHoldingTooManyReads(final int nReadsPerLocus, final int downsampleTo, final int payloadInBytes) { + logger.warn(String.format("testLIBS_NotHoldingTooManyReads %d %d %d", nReadsPerLocus, downsampleTo, payloadInBytes)); + final int readLength = 10; + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); + final int nSamples = 1; + final List samples = new ArrayList(nSamples); + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + final boolean downsample = downsampleTo != -1; + final DownsamplingMethod downsampler = downsample + ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) + : new DownsamplingMethod(DownsampleType.NONE, null, null, false); + + // final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final WeakReadTrackingIterator iterator = new WeakReadTrackingIterator(nReadsPerLocus, readLength, payloadInBytes, header); + + li = new LocusIteratorByState(iterator, + createTestReadProperties(downsampler, false), + genomeLocParser, + samples); + + while ( li.hasNext() ) { + final AlignmentContext next = li.next(); + Assert.assertTrue(next.getBasePileup().getNumberOfElements() <= downsampleTo, "Too many elements in pileup " + next); + // TODO -- assert that there are <= X reads in memory after GC for some X + } + } + + private static class WeakReadTrackingIterator implements Iterator { + final int nReads, readLength, payloadInBytes; + int readI = 0; + final SAMFileHeader header; + + private WeakReadTrackingIterator(int nReads, int readLength, final 
int payloadInBytes, final SAMFileHeader header) { + this.nReads = nReads; + this.readLength = readLength; + this.header = header; + this.payloadInBytes = payloadInBytes; + } + + @Override public boolean hasNext() { return readI < nReads; } + @Override public void remove() { throw new UnsupportedOperationException("no remove"); } + + @Override + public GATKSAMRecord next() { + readI++; + return makeRead(); + } + + private GATKSAMRecord makeRead() { + final SAMReadGroupRecord rg = header.getReadGroups().get(0); + final String readName = String.format("%s.%d.%s", "read", readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, 1, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + if ( payloadInBytes > 0 ) + // add a payload byte array to push memory use per read even higher + read.setAttribute("PL", new byte[payloadInBytes]); + return read; } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java similarity index 91% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java index 1db0605c7..b9f2fb29a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java @@ -38,11 +38,7 @@ import java.util.*; /** * testing of the new (non-legacy) version of LocusIteratorByState */ -public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { - /////////////////////////////////////// - // Read State Manager Tests // - /////////////////////////////////////// - +public class PerSampleReadStateManagerUnitTest extends 
LocusIteratorByStateBaseTest { private class PerSampleReadStateManagerTest extends TestDataProvider { private List readCountsPerAlignmentStart; private List reads; @@ -63,15 +59,12 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { } public void run() { - final List samples = LocusIteratorByState.sampleListForSAMWithoutReadGroups(); - final Iterator iterator = new LinkedList().iterator(); - ReadStateManager readStateManager = new ReadStateManager(iterator, samples, LIBSDownsamplingInfo.NO_DOWNSAMPLING, false); - ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = readStateManager.new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING); + PerSampleReadStateManager perSampleReadStateManager = new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING); makeReads(); for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + perSampleReadStateManager.addStatesAtNextAlignmentStart(new LinkedList(stackRecordStates)); } // read state manager should have the right number of reads diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java new file mode 100644 index 000000000..2a638eb69 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.apache.commons.collections.IteratorUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 1/15/13 + * Time: 3:49 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class ArtificialBAMBuilderUnitTest extends BaseTest { + @DataProvider(name = "ArtificialBAMBuilderUnitTestProvider") + public Object[][] makeArtificialBAMBuilderUnitTestProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + for ( final int readLength : Arrays.asList(10, 20) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nSamples : Arrays.asList(1, 2) ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) { + for ( final int nLoci : Arrays.asList(10, 100, 1000) ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + bamBuilder.createAndSetHeader(nSamples); + tests.add(new Object[]{bamBuilder, readLength, skips, start, nSamples, nReadsPerLocus, nLoci}); + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ArtificialBAMBuilderUnitTestProvider") + public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) { + Assert.assertEquals(bamBuilder.getReadLength(), readLength); + Assert.assertEquals(bamBuilder.getSkipNLoci(), skips); + Assert.assertEquals(bamBuilder.getAlignmentStart(), start); + Assert.assertEquals(bamBuilder.getNSamples(), nSamples); + Assert.assertEquals(bamBuilder.getnReadsPerLocus(), nReadsPerLocus); + Assert.assertEquals(bamBuilder.getnLoci(), nLoci); + + final List reads = bamBuilder.makeReads(); + Assert.assertEquals(reads.size(), bamBuilder.expectedNumberOfReads()); + for ( final GATKSAMRecord read : reads ) { + assertGoodRead(read, bamBuilder); + 
} + + final File bam = bamBuilder.makeTemporarilyBAMFile(); + final SAMFileReader reader = new SAMFileReader(bam); + Assert.assertTrue(reader.hasIndex()); + final Iterator bamIt = reader.iterator(); + int nReadsFromBam = 0; + int lastStart = -1; + while ( bamIt.hasNext() ) { + final SAMRecord read = bamIt.next(); + assertGoodRead(read, bamBuilder); + nReadsFromBam++; + Assert.assertTrue(read.getAlignmentStart() >= lastStart); + lastStart = read.getAlignmentStart(); + } + Assert.assertEquals(nReadsFromBam, bamBuilder.expectedNumberOfReads()); + } + + private void assertGoodRead(final SAMRecord read, final ArtificialBAMBuilder bamBuilder) { + Assert.assertEquals(read.getReadLength(), bamBuilder.getReadLength()); + Assert.assertEquals(read.getReadBases().length, bamBuilder.getReadLength()); + Assert.assertEquals(read.getBaseQualities().length, bamBuilder.getReadLength()); + Assert.assertTrue(read.getAlignmentStart() >= bamBuilder.getAlignmentStart()); + Assert.assertNotNull(read.getReadGroup()); + } +} + + diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 71c7d1bb0..4194aa6d5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -40,7 +40,7 @@ public class ReadUtilsUnitTest extends BaseTest { final int mateStart = 1000; final int BEFORE = mateStart - 2; final int AFTER = mateStart + 2; - Integer myStart, boundary; + int myStart, boundary; GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); read.setMateAlignmentStart(mateStart); @@ -51,43 +51,43 @@ public class ReadUtilsUnitTest extends BaseTest { read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + 
Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 2: positive strand, second read myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 3: negative strand, second read myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), mateStart - 1); + Assert.assertEquals(boundary, mateStart - 1); // Test case 4: negative strand, first read myStart = BEFORE; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), mateStart - 1); + Assert.assertEquals(boundary, mateStart - 1); // Test case 5: mate is mapped to another chromosome (test both strands) read.setInferredInsertSize(0); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setInferredInsertSize(10); // Test case 6: read is unmapped read.setReadUnmappedFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadUnmappedFlag(false); // Test case 7: reads don't overlap and look like this: @@ -99,7 +99,7 @@ public class ReadUtilsUnitTest extends BaseTest { read.setInferredInsertSize(20); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + 
Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); // second read: myStart = 1000; @@ -107,6 +107,6 @@ public class ReadUtilsUnitTest extends BaseTest { read.setMateAlignmentStart(980); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); } } diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java index 372d13a7a..37627204f 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java @@ -50,6 +50,34 @@ public class BaseUtilsUnitTest extends BaseTest { Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); }
+    /**
+     * Checks that {@code BaseUtils.convertIUPACtoN} leaves plain {@code A} bases untouched and
+     * rewrites the codes {@code W}, {@code M} and {@code K} to {@code N} at each array position.
+     */
+    @Test
+    public void testConvertIUPACtoN() {
+
+        checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false, false), new byte[]{'A', 'A', 'A'});
+        checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false, false), new byte[]{'N', 'A', 'A'});
+        checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false, false), new byte[]{'A', 'N', 'A'});
+        checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false, false), new byte[]{'A', 'A', 'N'});
+        checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false, false), new byte[]{'N', 'N', 'N'});
+    }
+
+    /**
+     * Asserts that two byte arrays have the same length and the same bytes in the same order.
+     *
+     * @param b1 the bytes produced by the code under test
+     * @param b2 the bytes that were expected
+     */
+    private void checkBytesAreEqual(final byte[] b1, final byte[] b2) {
+        // compare lengths first: otherwise a longer b2 would pass unnoticed and a shorter one
+        // would fail with an ArrayIndexOutOfBoundsException instead of an assertion error
+        Assert.assertEquals(b1.length, b2.length);
+        for ( int i = 0; i < b1.length; i++ )
+            Assert.assertEquals(b1[i], b2[i]);
+    }
+
 @Test public void testTransitionTransversion() { logger.warn("Executing testTransitionTransversion"); diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java index
49720d1f6..03d6f457f 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -154,9 +154,9 @@ public class GenotypeLikelihoodsUnitTest { public void testGetQualFromLikelihoodsMultiAllelic() { GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - Allele ref = Allele.create(BaseUtils.A,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.T); + Allele ref = Allele.create(BaseUtils.Base.A.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.T.base); List allAlleles = Arrays.asList(ref,alt1,alt2); List gtAlleles = Arrays.asList(alt1,alt2); GenotypeBuilder gtBuilder = new GenotypeBuilder(); diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala index 28be82136..23a99b586 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala @@ -27,6 +27,8 @@ package org.broadinstitute.sting.queue.util import java.io.File import org.broadinstitute.sting.utils.io.FileExtension +import java.util.Date +import java.net.URL /** * An extension of java.io.File that can be pulled from or pushed to a remote location. @@ -35,5 +37,6 @@ trait RemoteFile extends File with FileExtension { def pullToLocal() def pushToRemote() def deleteRemote() + def createUrl(expiration: Date): URL def remoteDescription: String }