diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelIntervalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelIntervalWalker.java deleted file mode 100644 index da15f91de..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelIntervalWalker.java +++ /dev/null @@ -1,137 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.indels; - -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.*; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.AlignmentBlock; -import net.sf.samtools.CigarElement; - -import java.util.List; - -/** - * Emits intervals consisting of indels from the aligned reads. - */ -@WalkerName("IndelIntervals") -@Requires({DataSource.READS, DataSource.REFERENCE}) -@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class}) -public class IndelIntervalWalker extends ReadWalker { - @Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false) - boolean allow454 = false; - @Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false) - int minIntervalIndelCount = 1; - - public void initialize() {} - - public boolean filter(char[] ref, SAMRecord read) { - return ( !read.getReadUnmappedFlag() && // mapped - read.getMappingQuality() != 0 && // positive mapping quality - read.getCigarLength() > 1 && // indel - (allow454 || !Utils.is454Read(read)) ); - } - - public Interval map(char[] ref, SAMRecord read) { -// List blocks = read.getAlignmentBlocks(); -// long indelLeftEdge = read.getAlignmentStart() + blocks.get(0).getLength() - 1; -// long indelRightEdge = read.getAlignmentEnd() - blocks.get(blocks.size()-1).getLength() + 1; - - long indelLeftEdge = -1; - long indelRightEdge = -1; - int lengthOnRef = 0; // length of the event on the reference (this preset value is right for insertion) - int pos = read.getAlignmentStart(); - for ( final CigarElement ce : read.getCigar().getCigarElements() ) { - switch( ce.getOperator() ) { - case S : - case H : break; // ignore clipping completely: alignment start is set to the first NON-clipped base - case P : // skip padding and matching bases, advance position on the ref: - case M : pos += ce.getLength(); break; - case D : - lengthOnRef = ce.getLength(); // deletion occupies non-zero number of bases on the ref - case I : - // for insertion we do not have to set lengthOnRef as it was already initialized to 0 - if ( indelLeftEdge == -1 ) { // it's the first indel in the current read - indelLeftEdge = pos - 1; // set to the last (matching) base before the indel - } - pos += lengthOnRef; - indelRightEdge = pos; // whether it is the first indel in the read or not, we set right edge immediately after it - break; - default : - throw new StingException("Unrecognized cigar element '"+ce.getOperator()+"' in read "+read.getReadName()); - } - } - - // at this point indelLeftEdge == -1 or [indelLeftEdge,indelRightEdge] is the interval encompassing ALL indels in the current read: - if ( indelLeftEdge == -1 ) return null; - - // we've seen some bam files coming from other centers that have reads (and indels!!) hanging over the contig ends; - // we do not want to deal with this stuff (what is it, anyway??) - if ( indelRightEdge > GenomeLocParser.getContigInfo(read.getReferenceName()).getSequenceLength() ) { - System.out.println("WARNING: read " + read.getReadName()+" contains indel(s) spanning past the contig end (read ignored)."); - return null; - } - -// if ( indelLeftEdge == 49563377 ) System.out.println("read: " +read.getReadName()); - GenomeLoc indelLoc = GenomeLocParser.createGenomeLoc(read.getReferenceIndex(), indelLeftEdge, indelRightEdge); - GenomeLoc refLoc = GenomeLocParser.createGenomeLoc(read); - - // if ( indelLeftEdge == 10313124 || indelLeftEdge == 10313170 ) System.out.println("read: " +read.getReadName() + " ; " + refLoc + " ; " + indelLoc); - - return new Interval(refLoc, indelLoc); - } - - public Interval reduceInit() { - return null; - } - - // Note that the reduce step assumes that reductions occur in order (i.e. no hierarchical reductions), - // although this can easily be changed if necessary. - public Interval reduce(Interval value, Interval sum) { - // if there is no interval to the left, then this is the first one - if ( sum == null ) - return value; - - if ( value == null ) return sum; // nothing to do, wait for the next - //System.out.println(sum + " ==> " + value); - - // if the intervals don't overlap, print out the leftmost one and start a new one - if ( !sum.readLoc.overlapsP(value.readLoc) ) { - if ( sum.indelCount >= minIntervalIndelCount ) - out.println(sum); - return value; - } - // otherwise, merge them - return sum.merge(value); - } - - public void onTraversalDone(Interval result) { - if ( result != null && result.indelCount >= minIntervalIndelCount ) - out.println(result); - } - - public class Interval { - public GenomeLoc readLoc = null; - public GenomeLoc indelLoc = null; - public int indelCount; - - Interval(GenomeLoc readLoc, GenomeLoc indelLoc) { - this.indelLoc = indelLoc; - this.readLoc = readLoc; - indelCount = 1; - } - - public Interval merge(Interval i) { - long indelLeftEdge = Math.min(this.indelLoc.getStart(), i.indelLoc.getStart()); - long indelRightEdge = Math.max(this.indelLoc.getStop(), i.indelLoc.getStop()); - GenomeLoc mergedIndelLoc = GenomeLocParser.createGenomeLoc(this.indelLoc.getContigIndex(), indelLeftEdge, indelRightEdge); - Interval mergedInterval = new Interval(this.readLoc.merge(i.readLoc), mergedIndelLoc); - mergedInterval.indelCount = this.indelCount + i.indelCount; - return mergedInterval; - } - - public String toString() { - return indelLoc.toString(); - } - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java deleted file mode 100755 index 0818be1c9..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.util.*; - - -/** - * Merges sets of intervals based on reads which overlap them; no two intervals in the final list - * will have a read that even partially spans them both. A maximum interval size is enforced though. - */ -@WalkerName("IntervalMerger") -@Requires({DataSource.READS, DataSource.REFERENCE}) -@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class}) -public class IntervalMergerWalker extends ReadWalker { - - @Argument(fullName="intervalsToMerge", shortName="intervals", doc="Intervals to merge", required=true) - List intervalsSource = null; - @Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false) - boolean allow454 = false; - @Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false) - int maxIntervalSize = 500; - - private LinkedList intervals; - private GenomeLoc currentInterval = null; - private boolean currentIntervalIsUsed = false; - - @Override - public void initialize() { - intervals = new LinkedList(GenomeLocParser.parseIntervals(intervalsSource, GenomeAnalysisEngine.instance.getArguments().intervalMerging)); - currentInterval = (intervals.size() > 0 ? intervals.removeFirst() : null); - } - - @Override - public Integer map(char[] ref, SAMRecord read) { - if ( currentInterval == null || - (!allow454 && Utils.is454Read(read)) ) - return 0; - - GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); - - // ignore all intervals which we've passed - while ( loc.isPast(currentInterval) ) { - if ( currentIntervalIsUsed && (maxIntervalSize <= 0 || currentInterval.getStop() - currentInterval.getStart() <= maxIntervalSize) ) { - out.println(currentInterval); - currentIntervalIsUsed = false; - } - if ( intervals.size() > 0 ) { - currentInterval = intervals.removeFirst(); - } else { - currentInterval = null; - return 0; - } - } - - // if we're not yet in the current interval, we're done - if ( !loc.overlapsP(currentInterval)) - return 0; - - // at this point, we're in the current interval. - // now we can merge any other intervals which we overlap - currentIntervalIsUsed = true; - while ( intervals.size() > 0 && loc.overlapsP(intervals.getFirst()) ) - currentInterval = GenomeLocParser.setStop(currentInterval, intervals.removeFirst().getStop()); - - return 1; - } - - @Override - public Integer reduceInit() { return 0; } - - @Override - public Integer reduce( Integer value, Integer accum ) { - return accum + value; - } - - @Override - public void onTraversalDone( Integer value ) { - if ( currentInterval != null && currentIntervalIsUsed && - (maxIntervalSize <= 0 || currentInterval.getStop() - currentInterval.getStart() <= maxIntervalSize) ) - out.println(currentInterval); - } -} - diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/MismatchIntervalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/MismatchIntervalWalker.java deleted file mode 100755 index dca4b9bff..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/MismatchIntervalWalker.java +++ /dev/null @@ -1,120 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.util.*; - -/** - * Emits intervals consisting of nearby loci with high mismatch rates. - */ -@WalkerName("MismatchIntervals") -@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class}) -public class MismatchIntervalWalker extends LocusWalker, Pair, GenomeLoc>> { - @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false) - int windowSize = 10; - @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of mismatching base qualities threshold", required=false) - double mismatchThreshold = 0.15; - @Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false) - boolean allow454 = false; - - private final int minReadsAtInterval = 4; - - public void initialize() { - if ( windowSize < 1) - throw new RuntimeException("Window Size must be a positive integer"); - } - - public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - char upperRef = Character.toUpperCase(ref.getBase()); - List reads = context.getReads(); - List offsets = context.getOffsets(); - int goodReads = 0, mismatchQualities = 0, totalQualities = 0; - for (int i = 0; i < reads.size(); i++) { - SAMRecord read = reads.get(i); - if ( read.getMappingQuality() == 0 || - read.getAlignmentBlocks().size() > 1 || - (!allow454 && Utils.is454Read(read)) ) - continue; - - goodReads++; - int offset = offsets.get(i); - int quality = (int)read.getBaseQualityString().charAt(offset) - 33; - totalQualities += quality; - - char base = Character.toUpperCase((char)read.getReadBases()[offset]); - if ( base != upperRef ) - mismatchQualities += quality; - } - - boolean flag = false; - if ( goodReads >= minReadsAtInterval && (double)mismatchQualities / (double)totalQualities > mismatchThreshold ) - flag = true; - return new Pair(context.getLocation(), flag); - } - - public void onTraversalDone(Pair, GenomeLoc> sum) { - if (sum.second != null) - out.println(sum.second); - } - - public Pair, GenomeLoc> reduceInit() { - return new Pair, GenomeLoc>(new LinkedList(), null); - } - - public Pair, GenomeLoc> reduce(Pair value, Pair, GenomeLoc> sum) { - // if we hit a new contig, clear the list - if ( sum.second != null && sum.second.getContigIndex() != value.first.getContigIndex() ) { - out.println(sum.second); - sum.first.clear(); - sum.second = null; - } - - sum.first.addLast(value.second); - if ( sum.first.size() <= windowSize ) - return sum; - - sum.first.remove(); - if ( !value.second ) - return sum; - - int mismatches = 0; - int firstMismatch = -1; - for (int i = 0; i < windowSize; i++) { - if ( sum.first.get(i) ) { - mismatches++; - if ( firstMismatch == -1 ) - firstMismatch = i; - } - } - - if ( mismatches > 1 ) { - // if there is no interval to the left, then this is the first one - if ( sum.second == null ) { - sum.second = value.first; - sum.second = GenomeLocParser.setStart(sum.second, sum.second.getStart() - windowSize + firstMismatch + 1); - } - // if the intervals don't overlap, print out the leftmost one and start a new one - else if ( value.first.getStop() - sum.second.getStop() > windowSize ) { - out.println(sum.second); - sum.second = value.first; - sum.second = GenomeLocParser.setStart(sum.second,sum.second.getStart() - windowSize + firstMismatch + 1); - } - // otherwise, merge them - else { - sum.second = GenomeLocParser.setStop(sum.second, value.first.getStop()); - } - } - - return sum; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index df630a16c..c2838f115 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -15,30 +15,24 @@ import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.*; /** - * Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads. + * Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 and MQ0 reads. */ @ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class}) public class RealignerTargetCreator extends LocusWalker { - // mismatch/entropy arguments + // mismatch/entropy/SNP arguments @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false) protected int windowSize = 10; @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false) protected double mismatchThreshold = 0.15; - - // observed indels arguments - @Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false) - int minIntervalIndelCount = 1; - + @Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false) + protected int minReadsAtLocus = 4; // interval merging arguments - @Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false) - int maxIntervalSize = 500; - - - private final int minReadsAtInterval = 4; + @Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false) + protected int maxIntervalSize = 500; @Override public boolean generateExtendedEvents() { return true; } @@ -115,7 +109,7 @@ public class RealignerTargetCreator extends LocusWalker 0.0 && mismatchThreshold <= 1.0 && - pileup.size() >= minReadsAtInterval && + pileup.size() >= minReadsAtLocus && (double)mismatchQualities / (double)totalQualities >= mismatchThreshold ) hasPointEvent = true; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SNPClusterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SNPClusterWalker.java deleted file mode 100755 index 30ec875de..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SNPClusterWalker.java +++ /dev/null @@ -1,68 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.indels; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariationRod; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.genotype.Variation; - -/** - * Given a ROD track SNP calls called "snps", emits intervals consisting of clustered SNPs. - */ -@WalkerName("SNPClusters") -@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="snps",type= VariationRod.class)}) -public class SNPClusterWalker extends RefWalker { - @Argument(fullName="windowSize", shortName="window", doc="window size for calculating clusters", required=false) - int windowSize = 10; - - public void initialize() { - if ( windowSize < 1) - throw new RuntimeException("Window Size must be a positive integer"); - } - - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Variation snp = (Variation)tracker.lookup("snps", null); - return (snp != null && snp.isBiallelic() && snp.isSNP()) ? context.getLocation() : null; - } - - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null && sum.getStart() != sum.getStop() ) - out.println(sum); - } - - public GenomeLoc reduceInit() { - return null; - } - - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - // ignore non-SNP variants - if ( value == null ) - return sum; - - // if we have no previous SNPs start with the new location - if ( sum == null ) - return value; - - // if we hit a new contig, emit and start with the new location - if ( sum.getContigIndex() != value.getContigIndex() ) { - if ( sum.getStart() != sum.getStop() ) - out.println(sum); - return value; - } - - // if the last SNP location was within a window, merge them - if ( value.getStart() - sum.getStop() <= windowSize ) { - sum = GenomeLocParser.setStop(sum,value.getStart()); - return sum; - } - - // otherwise, emit and start with the new location - if ( sum.getStart() != sum.getStop() ) - out.println(sum); - return value; - } -} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalsIntegrationTest.java index 52067b1b8..c861ee19d 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalsIntegrationTest.java @@ -10,22 +10,15 @@ public class IntervalsIntegrationTest extends WalkerTest { public void testIntervals() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - "-T IndelIntervals -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -o %s", + "-T RealignerTargetCreator -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("76f97b91921f427ab639b6b8228ac4dc")); - executeTest("testIndelIntervals", spec1); + Arrays.asList("d21e83a8b0d3f63acd9ca3b0b636e515")); + executeTest("test standard", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - "-T MismatchIntervals -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -o %s", + "-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("31e8b5d4c42f2c63c08b8f6b8e10ac99")); - executeTest("testMismatchIntervals", spec2); - - WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - "-T IntervalMerger -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -intervals " + validationDataLocation + "indelIntervals.test -intervals " + validationDataLocation + "mismatchIntervals.test -o %s", - 1, - Arrays.asList("bf1f23667ef0065bbcb9754f50c2d664")); - executeTest("testMergeIntervals", spec3); - + Arrays.asList("bfccfa50f62d10ee2fe8cfa68fb70002")); + executeTest("test dbsnp", spec2); } }