1. downsample when there are too many mismatching reads (needs perfecting)

2. allow user to specify that no reads be emitted


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@974 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-10 19:55:42 +00:00
parent 39dcd4f11f
commit b1f90635c1
1 changed files with 43 additions and 47 deletions

View File

@ -17,8 +17,8 @@ import java.io.FileWriter;
public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer> {
@Argument(fullName="maxReadLength", shortName="maxRead", doc="max read length", required=false)
public int maxReadLength = -1;
@Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads")
public String OUT;
@Argument(fullName="OutputCleaned", shortName="O", required=false, doc="Output file (sam or bam) for improved (realigned) reads")
public String OUT = null;
@Argument(fullName="OutputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found")
public String OUT_INDELS = null;
@Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false)
@ -27,22 +27,20 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
public String OUT_STATS = null;
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
public double LOD_THRESHOLD = 5.0;
@Argument(fullName="maxPileSize", shortName="maxSize", doc="max number of reads in the pile; if exceeded, no attempt will be made to realign the pile", required=false)
public int maxPileSize = 1000000000;
@Argument(fullName="EntropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
public double MISMATCH_THRESHOLD = 0.25;
@Argument(fullName="GreedyThreshold", shortName="greedy", doc="coverage above which the cleaner turns on greedy mode to improve performance", required=false)
public int GREEDY_THRESHOLD = 500;
@Argument(fullName="GreedyThreshold", shortName="greedy", doc="coverage (of reads with mismatches only) above which the cleaner turns on greedy mode to improve performance", required=false)
public int GREEDY_THRESHOLD = 100;
public static final int MAX_QUAL = 99;
private SAMFileWriter writer;
private SAMFileWriter writer = null;
private FileWriter indelOutput = null;
private FileWriter statsOutput = null;
// we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes
private TreeSet<ComparableSAMRecord> readsToWrite;
private TreeSet<ComparableSAMRecord> readsToWrite = null;
public void initialize() {
@ -52,7 +50,10 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
SAMFileHeader header = getToolkit().getEngine().getSAMHeader();
writer = Utils.createSAMFileWriterWithCompression(header, false, OUT, getToolkit().getBAMCompression());
if ( OUT != null ) {
writer = Utils.createSAMFileWriterWithCompression(header, false, OUT, getToolkit().getBAMCompression());
readsToWrite = new TreeSet<ComparableSAMRecord>();
}
logger.info("Writing into output BAM file at compression level " + getToolkit().getBAMCompression());
logger.info("Temporary space used: "+System.getProperty("java.io.tmpdir"));
@ -75,7 +76,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
statsOutput = null;
}
}
readsToWrite = new TreeSet<ComparableSAMRecord>();
}
// do we care about reads that are not part of our intervals?
@ -85,12 +85,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// What do we do with the reads that are not part of our intervals?
public void nonIntervalReadAction(SAMRecord read) {
try {
writer.addAlignment(read);
} catch (Exception e ) {
logger.error("Failed to write read "+read.getReadName()+" aligned at "+read.getReferenceName() +":"+read.getAlignmentStart());
e.printStackTrace(out);
throw new StingException(e.getMessage());
if ( writer != null ) {
try {
writer.addAlignment(read);
} catch (Exception e ) {
logger.error("Failed to write read "+read.getReadName()+" aligned at "+read.getReferenceName() +":"+read.getAlignmentStart());
e.printStackTrace(out);
throw new StingException(e.getMessage());
}
}
}
@ -105,39 +107,21 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
read.getMappingQuality() != 0 &&
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START )
goodReads.add(read);
else
else if ( writer != null )
readsToWrite.add(new ComparableSAMRecord(read));
}
if ( goodReads.size() > maxPileSize ) {
// too many reads, shy away!
if ( statsOutput != null ) {
try {
statsOutput.write(context.getLocation().toString());
statsOutput.write("\tSKIPPED ("+reads.size()+" reads total, "+goodReads.size()+" for realignment)\t");
statsOutput.write("-1.0");
statsOutput.write("\n");
statsOutput.flush();
} catch (Exception e) {}
}
// push all "good" reads into readsToWrite without cleaning, there are too many!
for ( SAMRecord read : goodReads ) {
readsToWrite.add(new ComparableSAMRecord(read));
}
goodReads.clear();
} else {
clean(goodReads, ref, context.getLocation());
}
clean(goodReads, ref, context.getLocation());
//bruteForceClean(goodReads, ref, context.getLocation().getStart());
//testCleanWithDeletion();
//testCleanWithInsertion();
Iterator<ComparableSAMRecord> iter = readsToWrite.iterator();
while ( iter.hasNext() )
writer.addAlignment(iter.next().getRecord());
readsToWrite.clear();
if ( writer != null ) {
Iterator<ComparableSAMRecord> iter = readsToWrite.iterator();
while ( iter.hasNext() )
writer.addAlignment(iter.next().getRecord());
readsToWrite.clear();
}
return 1;
}
@ -151,7 +135,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
public void onTraversalDone(Integer result) {
out.println("Saw " + result + " intervals");
writer.close();
if ( writer != null ) {
writer.close();
}
if ( OUT_INDELS != null ) {
try {
indelOutput.close();
@ -282,6 +268,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
}
// if we have too many reads with mismatches, be greedy
if ( altReads.size() > GREEDY_THRESHOLD) {
logger.debug("Downsampling from " + altReads.size() + " to " + GREEDY_THRESHOLD + " mismatching reads");
//sortByGreedy();
for ( int i = GREEDY_THRESHOLD; i < altReads.size(); i++)
altAlignmentsToTest.set(i, false);
}
Consensus bestConsensus = null;
// for each alternative consensus to test, align it to the reference and create an alternative consensus
@ -456,10 +450,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
// write them out
for ( SAMRecord rec : refReads )
readsToWrite.add(new ComparableSAMRecord(rec));
for ( AlignedRead aRec : altReads )
readsToWrite.add(new ComparableSAMRecord(aRec.getRead()));
if ( writer != null ) {
for ( SAMRecord rec : refReads )
readsToWrite.add(new ComparableSAMRecord(rec));
for ( AlignedRead aRec : altReads )
readsToWrite.add(new ComparableSAMRecord(aRec.getRead()));
}
}
private Pair<Integer, Integer> findBestOffset(String ref, AlignedRead read) {