print out stats for Andrey
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dfe464cd81
commit
4e41646c88
|
|
@ -1,8 +1,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.Pair;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.ComparableSAMRecord;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.*;
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
|
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
|
|
@ -26,18 +25,24 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
public String OUT_INDELS = null;
|
public String OUT_INDELS = null;
|
||||||
@Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false)
|
@Argument(fullName="OutputAllReads", shortName="all", doc="print out all reads (otherwise, just those within the intervals)", required=false)
|
||||||
public boolean printAllReads = false;
|
public boolean printAllReads = false;
|
||||||
|
@Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false)
|
||||||
|
public String OUT_STATS = null;
|
||||||
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
|
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
|
||||||
public double LOD_THRESHOLD = 5.0;
|
public double LOD_THRESHOLD = 5.0;
|
||||||
|
|
||||||
public static final int MAX_QUAL = 99;
|
public static final int MAX_QUAL = 99;
|
||||||
|
|
||||||
private SAMFileWriter writer;
|
private SAMFileWriter writer;
|
||||||
private FileWriter indelOutput = null;
|
private FileWriter indelOutput = null;
|
||||||
|
private FileWriter statsOutput = null;
|
||||||
|
|
||||||
// we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes
|
// we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes
|
||||||
private TreeSet<ComparableSAMRecord> readsToWrite;
|
private TreeSet<ComparableSAMRecord> readsToWrite;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
if ( LOD_THRESHOLD < 0.0 )
|
||||||
|
throw new RuntimeException("LOD threshold cannot be a negative number");
|
||||||
|
|
||||||
SAMFileHeader header = getToolkit().getEngine().getSAMHeader();
|
SAMFileHeader header = getToolkit().getEngine().getSAMHeader();
|
||||||
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT));
|
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT));
|
||||||
if ( OUT_INDELS != null ) {
|
if ( OUT_INDELS != null ) {
|
||||||
|
|
@ -48,6 +53,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
indelOutput = null;
|
indelOutput = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if ( OUT_STATS != null ) {
|
||||||
|
try {
|
||||||
|
statsOutput = new FileWriter(new File(OUT_STATS));
|
||||||
|
} catch (Exception e) {
|
||||||
|
err.println(e.getMessage());
|
||||||
|
statsOutput = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
readsToWrite = new TreeSet<ComparableSAMRecord>();
|
readsToWrite = new TreeSet<ComparableSAMRecord>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -75,7 +88,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
else
|
else
|
||||||
readsToWrite.add(new ComparableSAMRecord(read));
|
readsToWrite.add(new ComparableSAMRecord(read));
|
||||||
}
|
}
|
||||||
clean(goodReads, ref, context.getLocation().getStart());
|
clean(goodReads, ref, context.getLocation());
|
||||||
//bruteForceClean(goodReads, ref, context.getLocation().getStart());
|
//bruteForceClean(goodReads, ref, context.getLocation().getStart());
|
||||||
//testCleanWithDeletion();
|
//testCleanWithDeletion();
|
||||||
//testCleanWithInsertion();
|
//testCleanWithInsertion();
|
||||||
|
|
@ -101,9 +114,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
if ( OUT_INDELS != null ) {
|
if ( OUT_INDELS != null ) {
|
||||||
try {
|
try {
|
||||||
indelOutput.close();
|
indelOutput.close();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {}
|
||||||
err.println(e.getMessage());
|
}
|
||||||
}
|
if ( OUT_STATS != null ) {
|
||||||
|
try {
|
||||||
|
statsOutput.close();
|
||||||
|
} catch (Exception e) {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -162,8 +178,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void clean(List<SAMRecord> reads, String reference, long leftmostIndex) {
|
private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) {
|
||||||
|
|
||||||
|
long leftmostIndex = interval.getStart();
|
||||||
ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>();
|
ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>();
|
||||||
ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>();
|
ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>();
|
||||||
ArrayList<Boolean> altAlignmentsToTest = new ArrayList<Boolean>();
|
ArrayList<Boolean> altAlignmentsToTest = new ArrayList<Boolean>();
|
||||||
|
|
@ -271,7 +288,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
}
|
}
|
||||||
|
|
||||||
// if the best alternate consensus has a smaller sum of quality score mismatches (more than the LOD threshold), then clean!
|
// if the best alternate consensus has a smaller sum of quality score mismatches (more than the LOD threshold), then clean!
|
||||||
if ( bestConsensus != null && ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0 >= LOD_THRESHOLD ) {
|
double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0);
|
||||||
|
if ( improvement >= LOD_THRESHOLD ) {
|
||||||
logger.debug("CLEAN: " + bestConsensus.str );
|
logger.debug("CLEAN: " + bestConsensus.str );
|
||||||
if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
|
if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
|
||||||
// NOTE: indels are printed out in the format specified for the low-coverage pilot1
|
// NOTE: indels are printed out in the format specified for the low-coverage pilot1
|
||||||
|
|
@ -292,21 +310,40 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
indelOutput.flush();
|
indelOutput.flush();
|
||||||
} catch (Exception e) {}
|
} catch (Exception e) {}
|
||||||
}
|
}
|
||||||
|
if ( statsOutput != null ) {
|
||||||
|
try {
|
||||||
|
statsOutput.write(interval.toString());
|
||||||
|
statsOutput.write("\tCLEAN");
|
||||||
|
if ( bestConsensus.cigar.numCigarElements() > 1 )
|
||||||
|
statsOutput.write(" (found indel)");
|
||||||
|
statsOutput.write("\t");
|
||||||
|
statsOutput.write(Double.toString(improvement));
|
||||||
|
statsOutput.write("\n");
|
||||||
|
statsOutput.flush();
|
||||||
|
} catch (Exception e) {}
|
||||||
|
}
|
||||||
|
|
||||||
// We need to update the mapping quality score of the cleaned reads;
|
// We need to update the mapping quality score of the cleaned reads;
|
||||||
// however we don't have enough info to use the proper MAQ scoring system.
|
// however we don't have enough info to use the proper MAQ scoring system.
|
||||||
// For now, we'll use a heuristic:
|
// For now, we'll use a heuristic:
|
||||||
// the mapping quality score is improved by the LOD difference in mismatching
|
// the mapping quality score is improved by the LOD difference in mismatching
|
||||||
// bases between the reference and alternate consensus
|
// bases between the reference and alternate consensus
|
||||||
int improvement = (totalMismatchSum - bestConsensus.mismatchSum) / 10;
|
|
||||||
|
|
||||||
// clean the appropriate reads
|
// clean the appropriate reads
|
||||||
for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
|
for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
|
||||||
AlignedRead aRead = altReads.get(indexPair.getFirst());
|
AlignedRead aRead = altReads.get(indexPair.getFirst());
|
||||||
updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), aRead, (int)leftmostIndex);
|
updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), aRead, (int)leftmostIndex);
|
||||||
aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + improvement, 255));
|
aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + (int)improvement, 255));
|
||||||
aRead.getRead().setAttribute("NM", numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
|
aRead.getRead().setAttribute("NM", numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
|
||||||
}
|
}
|
||||||
|
} else if ( statsOutput != null ) {
|
||||||
|
try {
|
||||||
|
statsOutput.write(interval.toString());
|
||||||
|
statsOutput.write("\tFAIL\t");
|
||||||
|
statsOutput.write(Double.toString(improvement));
|
||||||
|
statsOutput.write("\n");
|
||||||
|
statsOutput.flush();
|
||||||
|
} catch (Exception e) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
// write them out
|
// write them out
|
||||||
|
|
@ -442,6 +479,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( difference > 0 ) {
|
if ( difference > 0 ) {
|
||||||
|
logger.debug("Realigning indel in " + read.getReadName());
|
||||||
Cigar newCigar = new Cigar();
|
Cigar newCigar = new Cigar();
|
||||||
newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
|
newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
|
||||||
newCigar.add(ce2);
|
newCigar.add(ce2);
|
||||||
|
|
@ -555,7 +593,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
reads.add(r5);
|
reads.add(r5);
|
||||||
reads.add(r6);
|
reads.add(r6);
|
||||||
reads.add(r7);
|
reads.add(r7);
|
||||||
clean(reads, reference, 0);
|
clean(reads, reference, new GenomeLoc(0,0));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void testCleanWithDeletion() {
|
private void testCleanWithDeletion() {
|
||||||
|
|
@ -611,7 +649,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
||||||
reads.add(r6);
|
reads.add(r6);
|
||||||
reads.add(r7);
|
reads.add(r7);
|
||||||
reads.add(r8);
|
reads.add(r8);
|
||||||
clean(reads, reference, 0);
|
clean(reads, reference, new GenomeLoc(0,0));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void bruteForceClean(List<SAMRecord> reads, String reference, long leftmostIndex) {
|
private void bruteForceClean(List<SAMRecord> reads, String reference, long leftmostIndex) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue