Big cleaner changes:

1. Added a Walker to merge intervals before cleaning
2. (Almost) all Walkers can filter out 454 reads (and do by default)
3. Got rid of -all command and related pieces (time to switch to CleanedReadsInjector)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1090 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-25 14:31:24 +00:00
parent 3cb6d7048e
commit 940d75171a
4 changed files with 128 additions and 42 deletions

View File

@ -3,8 +3,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.indels;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.AlignmentBlock;
@ -17,8 +16,8 @@ import java.util.List;
@WalkerName("IndelIntervals")
public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval, IndelIntervalWalker.Interval> {
@Argument(fullName="maxReadLength", shortName="maxRead", doc="max read length", required=false)
public int maxReadLength = -1;
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)
public boolean allow454 = false;
@Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false)
public int minIntervalIndelCount = 1;
@ -26,9 +25,9 @@ public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval
public boolean filter(char[] ref, SAMRecord read) {
return ( !read.getReadUnmappedFlag() && // mapped
(maxReadLength < 0 || read.getReadLength() <= maxReadLength) && // not too big
read.getMappingQuality() != 0 && // positive mapping quality
read.getAlignmentBlocks().size() > 1); // indel
read.getAlignmentBlocks().size() > 1 && // indel
(allow454 || !Utils.is454Read(read, getToolkit().getEngine().getSAMHeader())) );
}
public Interval map(char[] ref, SAMRecord read) {
@ -92,4 +91,4 @@ public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval
return indelLoc.toString();
}
}
}
}

View File

@ -17,14 +17,12 @@ import java.io.FileWriter;
@WalkerName("IntervalCleaner")
public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer> {
@Argument(fullName="maxReadLength", shortName="maxRead", doc="max read length", required=false)
public int maxReadLength = -1;
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)
public boolean allow454 = false;
@Argument(fullName="OutputCleaned", shortName="O", required=false, doc="Output file (sam or bam) for improved (realigned) reads")
public String OUT = null;
@Argument(fullName="OutputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found")
public String OUT_INDELS = null;
@Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false)
public boolean printAllReads = false;
@Argument(fullName="OutputCleanedReadsOnly", shortName="cleanedOnly", doc="print out cleaned reads only (otherwise, all reads within the intervals)", required=false)
public boolean cleanedReadsOnly = false;
@Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false)
@ -39,8 +37,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
public int MAX_CONSENSUSES = 30;
@Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false)
public int MAX_READS_FOR_CONSENSUSES = 120;
@Argument(fullName="noOutputSorting", shortName="nosort", doc="if specified, the output bam file may be not fully sorted. Use if low on temp space", required=false)
public boolean NOSORT=false;
public static final int MAX_QUAL = 99;
@ -62,12 +58,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
SAMFileHeader header = getToolkit().getEngine().getSAMHeader();
if ( OUT != null ) {
if ( NOSORT ) {
header.setSortOrder(SortOrder.unsorted);
writer = Utils.createSAMFileWriterWithCompression(header, true, OUT, getToolkit().getBAMCompression());
} else {
writer = Utils.createSAMFileWriterWithCompression(header, false, OUT, getToolkit().getBAMCompression());
}
writer = Utils.createSAMFileWriterWithCompression(header, true, OUT, getToolkit().getBAMCompression());
readsToWrite = new TreeSet<ComparableSAMRecord>();
}
@ -104,34 +95,16 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
}
// do we care about reads that are not part of our intervals?
public boolean actOnNonIntervalReads() {
return printAllReads && !cleanedReadsOnly;
}
// What do we do with the reads that are not part of our intervals?
public void nonIntervalReadAction(SAMRecord read) {
if ( writer != null ) {
try {
writer.addAlignment(read);
} catch (Exception e ) {
logger.error("Failed to write read "+read.getReadName()+" aligned at "+read.getReferenceName() +":"+read.getAlignmentStart());
e.printStackTrace(out);
throw new StingException(e.getMessage());
}
}
}
public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) {
List<SAMRecord> reads = context.getReads();
ArrayList<SAMRecord> goodReads = new ArrayList<SAMRecord>();
for ( SAMRecord read : reads ) {
if ( (maxReadLength < 0 || read.getReadLength() <= maxReadLength) &&
!read.getReadUnmappedFlag() &&
if ( !read.getReadUnmappedFlag() &&
!read.getNotPrimaryAlignmentFlag() &&
!read.getDuplicateReadFlag() &&
read.getMappingQuality() != 0 &&
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START )
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START &&
(allow454 || !Utils.is454Read(read, getToolkit().getEngine().getSAMHeader())) )
goodReads.add(read);
else if ( writer != null && !cleanedReadsOnly )
readsToWrite.add(new ComparableSAMRecord(read));

View File

@ -0,0 +1,110 @@
package org.broadinstitute.sting.playground.gatk.walkers.indels;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
* User: ebanks
* Date: Jun 10, 2009
* Time: 2:40:19 PM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* Merges intervals based on reads which overlap them.
*/
@WalkerName("IntervalMerger")
@Requires({DataSource.READS})
public class IntervalMergerWalker extends ReadWalker<Integer,Integer> {
@Argument(fullName="intervalsToMerge", shortName="intervals", doc="Intervals to merge", required=true)
String intervalsSource = null;
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)
public boolean allow454 = false;
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false)
public int maxIntervalSize = 500;
private LinkedList<GenomeLoc> intervals;
private GenomeLoc firstInterval = null;
@Override
public void initialize() {
intervals = parseIntervals(intervalsSource);
firstInterval = intervals.removeFirst();
}
@Override
public Integer map(char[] ref, SAMRecord read) {
if ( firstInterval == null ||
(!allow454 && Utils.is454Read(read, getToolkit().getEngine().getSAMHeader())) )
return 0;
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
// emit all intervals which we've passed
while ( loc.isPast(firstInterval) ) {
if ( firstInterval.getStop() - firstInterval.getStart() < maxIntervalSize)
out.println(firstInterval);
if ( intervals.size() > 0 ) {
firstInterval = intervals.removeFirst();
} else {
firstInterval = null;
return 0;
}
}
// if we're not yet in the first interval, we're done
if ( !loc.overlapsP(firstInterval))
return 0;
// at this point, we're in the first interval.
// now we can merge any other intervals which we overlap
while ( intervals.size() > 0 && loc.overlapsP(intervals.getFirst()) )
firstInterval.setStop(intervals.removeFirst().getStop());
return 1;
}
@Override
public Integer reduceInit() { return 0; }
@Override
public Integer reduce( Integer value, Integer accum ) {
return accum + value;
}
@Override
public void onTraversalDone( Integer value ) {
if ( firstInterval != null &&
firstInterval.getStop() - firstInterval.getStart() < maxIntervalSize)
out.println(firstInterval);
}
/**
* Load the intervals directly from the command-line or from file, as appropriate.
* Merge overlapping intervals.
* @param intervalsSource Source of intervals.
* @return a linked list of sorted, merged intervals.
*/
private LinkedList<GenomeLoc> parseIntervals(String intervalsSource) {
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource,false);
GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet();
for ( GenomeLoc parsedInterval : parsedIntervals )
intervalSortedSet.addRegion(parsedInterval);
return new LinkedList<GenomeLoc>( intervalSortedSet );
}
}

View File

@ -17,6 +17,8 @@ public class MismatchIntervalWalker extends LocusWalker<Pair<GenomeLoc, Boolean>
public int windowSize = 10;
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of mismatching base qualities threshold", required=false)
public double mismatchThreshold = 0.15;
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)
public boolean allow454 = false;
private final int minReadsAtInterval = 4;
@ -32,7 +34,9 @@ public class MismatchIntervalWalker extends LocusWalker<Pair<GenomeLoc, Boolean>
int goodReads = 0, mismatchQualities = 0, totalQualities = 0;
for (int i = 0; i < reads.size(); i++) {
SAMRecord read = reads.get(i);
if ( read.getMappingQuality() == 0 || read.getAlignmentBlocks().size() > 1 )
if ( read.getMappingQuality() == 0 ||
read.getAlignmentBlocks().size() > 1 ||
(!allow454 && Utils.is454Read(read, getToolkit().getEngine().getSAMHeader())) )
continue;
goodReads++;
@ -105,4 +109,4 @@ public class MismatchIntervalWalker extends LocusWalker<Pair<GenomeLoc, Boolean>
return sum;
}
}
}