From 1bdd76c2f25af38294cae18a38780d58cae889f1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 28 Oct 2011 09:28:12 -0400 Subject: [PATCH] These tools now use the IntervalBinding system to handle intervals instead of doing it all manually --- .../gatk/walkers/indels/IndelRealigner.java | 29 +----- .../walkers/indels/RealignedReadCounter.java | 91 ++----------------- .../indels/SomaticIndelDetectorWalker.java | 30 ++---- 3 files changed, 17 insertions(+), 133 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index ddd029e5d..56f5f11d7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -30,6 +30,7 @@ import net.sf.samtools.*; import net.sf.samtools.util.RuntimeIOException; import net.sf.samtools.util.SequenceUtil; import net.sf.samtools.util.StringUtil; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -45,10 +46,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.NWaySAMFileWriter; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -144,7 +141,7 @@ public class IndelRealigner extends ReadWalker { * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). */ @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) - protected String intervalsFile = null; + protected IntervalBinding intervalsFile = null; /** * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number @@ -359,33 +356,15 @@ public class IndelRealigner extends ReadWalker { throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); } - NwayIntervalMergingIterator merger = new NwayIntervalMergingIterator(IntervalMergingRule.OVERLAPPING_ONLY); - List rawIntervals = new ArrayList(); - // separate argument on semicolon first - for (String fileOrInterval : intervalsFile.split(";")) { - // if it's a file, add items to raw interval list - if (IntervalUtils.isIntervalFile(fileOrInterval)) { - merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) ); - } else { - rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval)); - } - } - if ( ! rawIntervals.isEmpty() ) - merger.add(rawIntervals.iterator()); - - // prepare to read intervals one-by-one, as needed - intervals = merger; + intervals = intervalsFile.getIntervals(getToolkit()).iterator(); currentInterval = intervals.hasNext() ? intervals.next() : null; writerToUse = writer; if ( N_WAY_OUT != null ) { - // Map args = getToolkit().getArguments().walkerArgs; boolean createIndex = true; - // if ( args.containsKey("disable_bam_indexing") ) { System.out.println("NO INDEXING!!"); System.exit(1); createIndex = false; } - if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s); } else { @@ -557,7 +536,7 @@ public class IndelRealigner extends ReadWalker { } while ( currentInterval != null && (readLoc == null || currentInterval.isBefore(readLoc)) ); } catch (ReviewedStingException e) { - throw new UserException.MissortedFile(new File(intervalsFile), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. ***", e); + throw new UserException.MissortedFile(new File(intervalsFile.getSource()), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. ***", e); } sawReadInCurrentInterval = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java index 2c89b907b..f1c1a8d9e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java @@ -25,23 +25,12 @@ package org.broadinstitute.sting.gatk.walkers.indels; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.io.File; -import java.util.Iterator; @By(DataSource.READS) // walker to count realigned reads @@ -50,87 +39,23 @@ public class RealignedReadCounter extends ReadWalker { public static final String ORIGINAL_CIGAR_TAG = "OC"; public static final String ORIGINAL_POSITION_TAG = "OP"; - @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) - protected String intervalsFile = null; - - // the intervals input by the user - private Iterator intervals = null; - - // the current interval in the list - private GenomeLoc currentInterval = null; - - private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0; - private boolean intervalWasUpdated = false; - - public void initialize() { - // prepare to read intervals one-by-one, as needed (assuming they are sorted). - intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); - currentInterval = intervals.hasNext() ? intervals.next() : null; - } + private long updatedReads = 0; public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if ( currentInterval == null ) { - return 0; - } - GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read); - // hack to get around unmapped reads having screwy locations - if ( readLoc.getStop() == 0 ) - readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); - - if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) ) - return 0; - - if ( readLoc.overlapsP(currentInterval) ) { - if ( doNotTryToClean(read) ) + if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { + String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); + // deal with an old bug + if ( read.getCigar().toString().equals(newCigar) ) { return 0; - - if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { - String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); - // deal with an old bug - if ( read.getCigar().toString().equals(newCigar) ) { - //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar); - return 0; - } - - if ( !intervalWasUpdated ) { - intervalWasUpdated = true; - updatedIntervals++; - affectedBases += 20 + getIndelSize(read); - } - updatedReads++; - } - } else { - do { - intervalWasUpdated = false; - currentInterval = intervals.hasNext() ? intervals.next() : null; - } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); + + updatedReads++; } return 0; } - private int getIndelSize(SAMRecord read) { - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.I ) - return 0; - if ( ce.getOperator() == CigarOperator.D ) - return ce.getLength(); - } - logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar()); - return 0; - } - - private boolean doNotTryToClean(SAMRecord read) { - return read.getReadUnmappedFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getReadFailsVendorQualityCheckFlag() || - read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - (BadMateFilter.hasBadMate(read)); - } - public Integer reduceInit() { return 0; } @@ -140,8 +65,6 @@ public class RealignedReadCounter extends ReadWalker { } public void onTraversalDone(Integer result) { - System.out.println(updatedIntervals + " intervals were updated"); System.out.println(updatedReads + " reads were updated"); - System.out.println(affectedBases + " bases were affected"); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 434cbec52..5a99f26f0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -26,10 +26,8 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Tags; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -55,7 +53,6 @@ import org.broadinstitute.sting.utils.collections.CircularArray; import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; @@ -135,15 +132,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { @Hidden @Argument(fullName = "genotype_intervals", shortName = "genotype", doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) - public String genotypeIntervalsFile = null; - - @Hidden - @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, - doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ - "if the list turns out to be unsorted, it will throw an exception. "+ - "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ - "to sort and keep it in memory (increases memory usage!).") - protected boolean GENOTYPE_NOT_SORTED = false; + public IntervalBinding genotypeIntervalsFile = null; @Hidden @Argument(fullName="unpaired", shortName="unpaired", @@ -365,16 +354,9 @@ public class SomaticIndelDetectorWalker extends ReadWalker { } if ( genotypeIntervalsFile != null ) { - if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) { - // prepare to read intervals one-by-one, as needed (assuming they are sorted). - genotypeIntervalIterator = new IntervalFileMergingIterator(getToolkit().getGenomeLocParser(), - new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); - } else { - // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), - IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(genotypeIntervalsFile)), IntervalMergingRule.OVERLAPPING_ONLY); - genotypeIntervalIterator = locs.iterator(); - } + // read in the whole list of intervals for cleaning + GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), genotypeIntervalsFile.getIntervals(getToolkit()), IntervalMergingRule.OVERLAPPING_ONLY); + genotypeIntervalIterator = locs.iterator(); // wrap intervals requested for genotyping inside overlapping iterator, so that we actually // genotype only on the intersections of the requested intervals with the -L intervals