From a6477df6d1cd52379c8b66b3fd0cf2237d1bf3a3 Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 15 Jun 2009 20:02:02 +0000 Subject: [PATCH] Now optionally outputs whether "SNPs" are maintained/cleaned out/introduced by cleaning git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1013 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/IntervalCleanerWalker.java | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java index 3769c1ec1..2888b0fd7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java @@ -25,6 +25,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker public boolean printAllReads = false; @Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false) public String OUT_STATS = null; + @Argument(fullName="SNPsFile", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out", required=false) + public String OUT_SNPS = null; @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) public double LOD_THRESHOLD = 5.0; @Argument(fullName="EntropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) @@ -37,6 +39,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker private SAMFileWriter writer = null; private FileWriter indelOutput = null; private FileWriter statsOutput = null; + private FileWriter snpsOutput = null; // we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes @@ -76,6 +79,15 @@ public class IntervalCleanerWalker extends LocusWindowWalker statsOutput = null; } } + if ( OUT_SNPS != null ) { + try { + snpsOutput = new FileWriter(new File(OUT_SNPS)); + } catch (Exception e) { + logger.warn("Failed to create output file "+ OUT_SNPS+". Cleaning snps output will be suppressed"); + err.println(e.getMessage()); + snpsOutput = null; + } + } } // do we care about reads that are not part of our intervals? @@ -152,6 +164,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt."); } } + if ( OUT_SNPS != null ) { + try { + snpsOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_SNPS+" gracefully. Data may be corrupt."); + } + } } @@ -616,6 +635,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker } int originalMismatchColumns = 0, cleanedMismatchColumns = 0; + StringBuffer sb = new StringBuffer(); for ( int i=0; i < reference.length(); i++ ) { if ( cleanedMismatchBases[i] == originalMismatchBases[i] ) continue; @@ -623,11 +643,32 @@ public class IntervalCleanerWalker extends LocusWindowWalker originalMismatchColumns++; if ( cleanedMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) cleanedMismatchColumns++; + if ( snpsOutput != null ) { + if ( originalMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) { + sb.append(reads.get(0).getRead().getReferenceName() + ":"); + sb.append(((int)leftmostIndex + i)); + if ( cleanedMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) + sb.append(" SAME_SNP\n"); + else + sb.append(" NOT_SNP\n"); + } else if ( cleanedMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) { + sb.append(reads.get(0).getRead().getReferenceName() + ":"); + sb.append(((int)leftmostIndex + i)); + sb.append(" NEW_SNP\n"); + } + } } logger.debug("Original mismatch columns = " + originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); - return (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); + boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); + if ( reduces && snpsOutput != null ) { + try { + snpsOutput.write(sb.toString()); + snpsOutput.flush(); + } catch (Exception e) {} + } + return reduces; } /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq