From b1f90635c17b6a516c0bb926ac13f6f5eae7d2d1 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 10 Jun 2009 19:55:42 +0000 Subject: [PATCH] 1. downsample when there are too many mismatching reads (needs perfecting) 2. allow user to specify that no reads be emitted git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@974 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/IntervalCleanerWalker.java | 90 +++++++++---------- 1 file changed, 43 insertions(+), 47 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java index 1281a6de4..46e74b192 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java @@ -17,8 +17,8 @@ import java.io.FileWriter; public class IntervalCleanerWalker extends LocusWindowWalker { @Argument(fullName="maxReadLength", shortName="maxRead", doc="max read length", required=false) public int maxReadLength = -1; - @Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads") - public String OUT; + @Argument(fullName="OutputCleaned", shortName="O", required=false, doc="Output file (sam or bam) for improved (realigned) reads") + public String OUT = null; @Argument(fullName="OutputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found") public String OUT_INDELS = null; @Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false) @@ -27,22 +27,20 @@ public class IntervalCleanerWalker extends LocusWindowWalker public String OUT_STATS = null; @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) public double LOD_THRESHOLD = 5.0; - @Argument(fullName="maxPileSize", shortName="maxSize", doc="max number of reads in the pile; if exceeded, no attempt will be made to realign the pile", required=false) - public int maxPileSize = 1000000000; @Argument(fullName="EntropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) public double MISMATCH_THRESHOLD = 0.25; - @Argument(fullName="GreedyThreshold", shortName="greedy", doc="coverage above which the cleaner turns on greedy mode to improve performance", required=false) - public int GREEDY_THRESHOLD = 500; + @Argument(fullName="GreedyThreshold", shortName="greedy", doc="coverage (of reads with mismatches only) above which the cleaner turns on greedy mode to improve performance", required=false) + public int GREEDY_THRESHOLD = 100; public static final int MAX_QUAL = 99; - private SAMFileWriter writer; + private SAMFileWriter writer = null; private FileWriter indelOutput = null; private FileWriter statsOutput = null; // we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes - private TreeSet readsToWrite; + private TreeSet readsToWrite = null; public void initialize() { @@ -52,7 +50,10 @@ public class IntervalCleanerWalker extends LocusWindowWalker throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); SAMFileHeader header = getToolkit().getEngine().getSAMHeader(); - writer = Utils.createSAMFileWriterWithCompression(header, false, OUT, getToolkit().getBAMCompression()); + if ( OUT != null ) { + writer = Utils.createSAMFileWriterWithCompression(header, false, OUT, getToolkit().getBAMCompression()); + readsToWrite = new TreeSet(); + } logger.info("Writing into output BAM file at compression level " + getToolkit().getBAMCompression()); logger.info("Temporary space used: "+System.getProperty("java.io.tmpdir")); @@ -75,7 +76,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker statsOutput = null; } } - readsToWrite = new TreeSet(); } // do we care about reads that are not part of our intervals? @@ -85,12 +85,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker // What do we do with the reads that are not part of our intervals? public void nonIntervalReadAction(SAMRecord read) { - try { - writer.addAlignment(read); - } catch (Exception e ) { - logger.error("Failed to write read "+read.getReadName()+" aligned at "+read.getReferenceName() +":"+read.getAlignmentStart()); - e.printStackTrace(out); - throw new StingException(e.getMessage()); + if ( writer != null ) { + try { + writer.addAlignment(read); + } catch (Exception e ) { + logger.error("Failed to write read "+read.getReadName()+" aligned at "+read.getReferenceName() +":"+read.getAlignmentStart()); + e.printStackTrace(out); + throw new StingException(e.getMessage()); + } } } @@ -105,39 +107,21 @@ public class IntervalCleanerWalker extends LocusWindowWalker read.getMappingQuality() != 0 && read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) goodReads.add(read); - else + else if ( writer != null ) readsToWrite.add(new ComparableSAMRecord(read)); } - if ( goodReads.size() > maxPileSize ) { - // too many reads, shy away! - - if ( statsOutput != null ) { - try { - statsOutput.write(context.getLocation().toString()); - statsOutput.write("\tSKIPPED ("+reads.size()+" reads total, "+goodReads.size()+" for realignment)\t"); - statsOutput.write("-1.0"); - statsOutput.write("\n"); - statsOutput.flush(); - } catch (Exception e) {} - - } - // push all "good" reads into readsToWrite without cleaning, there are too many! - for ( SAMRecord read : goodReads ) { - readsToWrite.add(new ComparableSAMRecord(read)); - } - goodReads.clear(); - } else { - clean(goodReads, ref, context.getLocation()); - } + clean(goodReads, ref, context.getLocation()); //bruteForceClean(goodReads, ref, context.getLocation().getStart()); //testCleanWithDeletion(); //testCleanWithInsertion(); - Iterator iter = readsToWrite.iterator(); - while ( iter.hasNext() ) - writer.addAlignment(iter.next().getRecord()); - readsToWrite.clear(); + if ( writer != null ) { + Iterator iter = readsToWrite.iterator(); + while ( iter.hasNext() ) + writer.addAlignment(iter.next().getRecord()); + readsToWrite.clear(); + } return 1; } @@ -151,7 +135,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker public void onTraversalDone(Integer result) { out.println("Saw " + result + " intervals"); - writer.close(); + if ( writer != null ) { + writer.close(); + } if ( OUT_INDELS != null ) { try { indelOutput.close(); @@ -282,6 +268,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker } } + // if we have too many reads with mismatches, be greedy + if ( altReads.size() > GREEDY_THRESHOLD) { + logger.debug("Downsampling from " + altReads.size() + " to " + GREEDY_THRESHOLD + " mismatching reads"); + //sortByGreedy(); + for ( int i = GREEDY_THRESHOLD; i < altReads.size(); i++) + altAlignmentsToTest.set(i, false); + } + Consensus bestConsensus = null; // for each alternative consensus to test, align it to the reference and create an alternative consensus @@ -456,10 +450,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker } // write them out - for ( SAMRecord rec : refReads ) - readsToWrite.add(new ComparableSAMRecord(rec)); - for ( AlignedRead aRec : altReads ) - readsToWrite.add(new ComparableSAMRecord(aRec.getRead())); + if ( writer != null ) { + for ( SAMRecord rec : refReads ) + readsToWrite.add(new ComparableSAMRecord(rec)); + for ( AlignedRead aRec : altReads ) + readsToWrite.add(new ComparableSAMRecord(aRec.getRead())); + } } private Pair findBestOffset(String ref, AlignedRead read) {