print out stats for Andrey

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-03 17:45:35 +00:00
parent dfe464cd81
commit 4e41646c88
1 changed files with 51 additions and 13 deletions

View File

@ -1,8 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.indels;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.ComparableSAMRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
@ -26,18 +25,24 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
public String OUT_INDELS = null;
@Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false)
public boolean printAllReads = false;
@Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false)
public String OUT_STATS = null;
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
public double LOD_THRESHOLD = 5.0;
public static final int MAX_QUAL = 99;
private SAMFileWriter writer;
private FileWriter indelOutput = null;
private FileWriter indelOutput = null;
private FileWriter statsOutput = null;
// we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes
private TreeSet<ComparableSAMRecord> readsToWrite;
public void initialize() {
if ( LOD_THRESHOLD < 0.0 )
throw new RuntimeException("LOD threshold cannot be a negative number");
SAMFileHeader header = getToolkit().getEngine().getSAMHeader();
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT));
if ( OUT_INDELS != null ) {
@ -48,6 +53,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
indelOutput = null;
}
}
if ( OUT_STATS != null ) {
try {
statsOutput = new FileWriter(new File(OUT_STATS));
} catch (Exception e) {
err.println(e.getMessage());
statsOutput = null;
}
}
readsToWrite = new TreeSet<ComparableSAMRecord>();
}
@ -75,7 +88,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
else
readsToWrite.add(new ComparableSAMRecord(read));
}
clean(goodReads, ref, context.getLocation().getStart());
clean(goodReads, ref, context.getLocation());
//bruteForceClean(goodReads, ref, context.getLocation().getStart());
//testCleanWithDeletion();
//testCleanWithInsertion();
@ -101,9 +114,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
if ( OUT_INDELS != null ) {
try {
indelOutput.close();
} catch (Exception e) {
err.println(e.getMessage());
}
} catch (Exception e) {}
}
if ( OUT_STATS != null ) {
try {
statsOutput.close();
} catch (Exception e) {}
}
}
@ -162,8 +178,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
return sum;
}
private void clean(List<SAMRecord> reads, String reference, long leftmostIndex) {
private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) {
long leftmostIndex = interval.getStart();
ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>();
ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>();
ArrayList<Boolean> altAlignmentsToTest = new ArrayList<Boolean>();
@ -271,7 +288,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
// if the best alternate consensus has a smaller sum of quality score mismatches (more than the LOD threshold), then clean!
if ( bestConsensus != null && ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0 >= LOD_THRESHOLD ) {
double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0);
if ( improvement >= LOD_THRESHOLD ) {
logger.debug("CLEAN: " + bestConsensus.str );
if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
// NOTE: indels are printed out in the format specified for the low-coverage pilot1
@ -292,21 +310,40 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
indelOutput.flush();
} catch (Exception e) {}
}
if ( statsOutput != null ) {
try {
statsOutput.write(interval.toString());
statsOutput.write("\tCLEAN");
if ( bestConsensus.cigar.numCigarElements() > 1 )
statsOutput.write(" (found indel)");
statsOutput.write("\t");
statsOutput.write(Double.toString(improvement));
statsOutput.write("\n");
statsOutput.flush();
} catch (Exception e) {}
}
// We need to update the mapping quality score of the cleaned reads;
// however we don't have enough info to use the proper MAQ scoring system.
// For now, we'll use a heuristic:
// the mapping quality score is improved by the LOD difference in mismatching
// bases between the reference and alternate consensus
int improvement = (totalMismatchSum - bestConsensus.mismatchSum) / 10;
// clean the appropriate reads
for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
AlignedRead aRead = altReads.get(indexPair.getFirst());
updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), aRead, (int)leftmostIndex);
aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + improvement, 255));
aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + (int)improvement, 255));
aRead.getRead().setAttribute("NM", numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
}
} else if ( statsOutput != null ) {
try {
statsOutput.write(interval.toString());
statsOutput.write("\tFAIL\t");
statsOutput.write(Double.toString(improvement));
statsOutput.write("\n");
statsOutput.flush();
} catch (Exception e) {}
}
// write them out
@ -442,6 +479,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
if ( difference > 0 ) {
logger.debug("Realigning indel in " + read.getReadName());
Cigar newCigar = new Cigar();
newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
newCigar.add(ce2);
@ -555,7 +593,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
reads.add(r5);
reads.add(r6);
reads.add(r7);
clean(reads, reference, 0);
clean(reads, reference, new GenomeLoc(0,0));
}
private void testCleanWithDeletion() {
@ -611,7 +649,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
reads.add(r6);
reads.add(r7);
reads.add(r8);
clean(reads, reference, 0);
clean(reads, reference, new GenomeLoc(0,0));
}
private void bruteForceClean(List<SAMRecord> reads, String reference, long leftmostIndex) {