From 4e41646c881a5a7a3e43fae68e77c0b06f0fcd1a Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 3 Jun 2009 17:45:35 +0000 Subject: [PATCH] print out stats for Andrey git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@890 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/IntervalCleanerWalker.java | 64 +++++++++++++++---- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java index f5057dc40..691a750cd 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java @@ -1,8 +1,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.indels; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.ComparableSAMRecord; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; @@ -26,18 +25,24 @@ public class IntervalCleanerWalker extends LocusWindowWalker public String OUT_INDELS = null; @Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false) public boolean printAllReads = false; + @Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false) + public String OUT_STATS = null; @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) public double LOD_THRESHOLD = 5.0; public static final int MAX_QUAL = 99; private SAMFileWriter writer; - private FileWriter indelOutput = null; + private FileWriter indelOutput = null; + private FileWriter statsOutput = null; // we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes private TreeSet readsToWrite; public void initialize() { + if ( LOD_THRESHOLD < 0.0 ) + throw new RuntimeException("LOD threshold cannot be a negative number"); + SAMFileHeader header = getToolkit().getEngine().getSAMHeader(); writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT)); if ( OUT_INDELS != null ) { @@ -48,6 +53,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker indelOutput = null; } } + if ( OUT_STATS != null ) { + try { + statsOutput = new FileWriter(new File(OUT_STATS)); + } catch (Exception e) { + err.println(e.getMessage()); + statsOutput = null; + } + } readsToWrite = new TreeSet(); } @@ -75,7 +88,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker else readsToWrite.add(new ComparableSAMRecord(read)); } - clean(goodReads, ref, context.getLocation().getStart()); + clean(goodReads, ref, context.getLocation()); //bruteForceClean(goodReads, ref, context.getLocation().getStart()); //testCleanWithDeletion(); //testCleanWithInsertion(); @@ -101,9 +114,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker if ( OUT_INDELS != null ) { try { indelOutput.close(); - } catch (Exception e) { - err.println(e.getMessage()); - } + } catch (Exception e) {} + } + if ( OUT_STATS != null ) { + try { + statsOutput.close(); + } catch (Exception e) {} } } @@ -162,8 +178,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker return sum; } - private void clean(List reads, String reference, long leftmostIndex) { + private void clean(List reads, String reference, GenomeLoc interval) { + long leftmostIndex = interval.getStart(); ArrayList refReads = new ArrayList(); ArrayList altReads = new ArrayList(); ArrayList altAlignmentsToTest = new ArrayList(); @@ -271,7 +288,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker } // if the best alternate consensus has a smaller sum of quality score mismatches (more than the LOD threshold), then clean! - if ( bestConsensus != null && ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0 >= LOD_THRESHOLD ) { + double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0); + if ( improvement >= LOD_THRESHOLD ) { logger.debug("CLEAN: " + bestConsensus.str ); if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) { // NOTE: indels are printed out in the format specified for the low-coverage pilot1 @@ -292,21 +310,40 @@ public class IntervalCleanerWalker extends LocusWindowWalker indelOutput.flush(); } catch (Exception e) {} } + if ( statsOutput != null ) { + try { + statsOutput.write(interval.toString()); + statsOutput.write("\tCLEAN"); + if ( bestConsensus.cigar.numCigarElements() > 1 ) + statsOutput.write(" (found indel)"); + statsOutput.write("\t"); + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) {} + } // We need to update the mapping quality score of the cleaned reads; // however we don't have enough info to use the proper MAQ scoring system. // For now, we'll use a heuristic: // the mapping quality score is improved by the LOD difference in mismatching // bases between the reference and alternate consensus - int improvement = (totalMismatchSum - bestConsensus.mismatchSum) / 10; // clean the appropriate reads for ( Pair indexPair : bestConsensus.readIndexes ) { AlignedRead aRead = altReads.get(indexPair.getFirst()); updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), aRead, (int)leftmostIndex); - aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + improvement, 255)); + aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + (int)improvement, 255)); aRead.getRead().setAttribute("NM", numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex)); } + } else if ( statsOutput != null ) { + try { + statsOutput.write(interval.toString()); + statsOutput.write("\tFAIL\t"); + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) {} } // write them out @@ -442,6 +479,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker } if ( difference > 0 ) { + logger.debug("Realigning indel in " + read.getReadName()); Cigar newCigar = new Cigar(); newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M)); newCigar.add(ce2); @@ -555,7 +593,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker reads.add(r5); reads.add(r6); reads.add(r7); - clean(reads, reference, 0); + clean(reads, reference, new GenomeLoc(0,0)); } private void testCleanWithDeletion() { @@ -611,7 +649,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker reads.add(r6); reads.add(r7); reads.add(r8); - clean(reads, reference, 0); + clean(reads, reference, new GenomeLoc(0,0)); } private void bruteForceClean(List reads, String reference, long leftmostIndex) {