Now optionally outputs whether "SNPs" are maintained/cleaned out/introduced by cleaning

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1013 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-15 20:02:02 +00:00
parent 29df74ae23
commit a6477df6d1
1 changed files with 42 additions and 1 deletions

View File

@ -25,6 +25,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
public boolean printAllReads = false;
@Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false)
public String OUT_STATS = null;
@Argument(fullName="SNPsFile", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out", required=false)
public String OUT_SNPS = null;
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
public double LOD_THRESHOLD = 5.0;
@Argument(fullName="EntropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
@ -37,6 +39,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
private SAMFileWriter writer = null;
private FileWriter indelOutput = null;
private FileWriter statsOutput = null;
private FileWriter snpsOutput = null;
// we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes
@ -76,6 +79,15 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
statsOutput = null;
}
}
if ( OUT_SNPS != null ) {
try {
snpsOutput = new FileWriter(new File(OUT_SNPS));
} catch (Exception e) {
logger.warn("Failed to create output file "+ OUT_SNPS+". Cleaning snps output will be suppressed");
err.println(e.getMessage());
snpsOutput = null;
}
}
}
// do we care about reads that are not part of our intervals?
@ -152,6 +164,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt.");
}
}
if ( OUT_SNPS != null ) {
try {
snpsOutput.close();
} catch (Exception e) {
logger.error("Failed to close "+OUT_SNPS+" gracefully. Data may be corrupt.");
}
}
}
@ -616,6 +635,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
int originalMismatchColumns = 0, cleanedMismatchColumns = 0;
StringBuffer sb = new StringBuffer();
for ( int i=0; i < reference.length(); i++ ) {
if ( cleanedMismatchBases[i] == originalMismatchBases[i] )
continue;
@ -623,11 +643,32 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
originalMismatchColumns++;
if ( cleanedMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD )
cleanedMismatchColumns++;
if ( snpsOutput != null ) {
if ( originalMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) {
sb.append(reads.get(0).getRead().getReferenceName() + ":");
sb.append(((int)leftmostIndex + i));
if ( cleanedMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD )
sb.append(" SAME_SNP\n");
else
sb.append(" NOT_SNP\n");
} else if ( cleanedMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) {
sb.append(reads.get(0).getRead().getReferenceName() + ":");
sb.append(((int)leftmostIndex + i));
sb.append(" NEW_SNP\n");
}
}
}
logger.debug("Original mismatch columns = " + originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns);
return (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns);
boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns);
if ( reduces && snpsOutput != null ) {
try {
snpsOutput.write(sb.toString());
snpsOutput.flush();
} catch (Exception e) {}
}
return reduces;
}
/** Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>