Optimizations to the cleaner algorithm; reduce total runtime by almost 20%.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2852 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8fd3351971
commit
7669eaaeb3
|
|
@ -41,8 +41,14 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]")
|
@Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]")
|
||||||
protected Integer compressionLevel = 5;
|
protected Integer compressionLevel = 5;
|
||||||
|
|
||||||
@Argument(fullName="sortInMemory", shortName="sortInMemory", required=false, doc="Should we sort in memory instead of on disk? This option is much faster but should be used with care - with too much coverage or with long reads it might generate failures [default:no]")
|
public enum RealignerSortingStrategy {
|
||||||
protected boolean SORT_IN_MEMORY = false;
|
NO_SORT,
|
||||||
|
ON_DISK,
|
||||||
|
IN_MEMORY
|
||||||
|
}
|
||||||
|
|
||||||
|
@Argument(fullName="sortStrategy", shortName="sort", required=false, doc="What type of sorting strategy should we use? Options include NO_SORT, ON_DISK, and IN_MEMORY. Sorting in memory is much faster than on disk but should be used with care - with too much coverage or with long reads it might generate failures [default:ON_DISK]")
|
||||||
|
protected RealignerSortingStrategy SORTING_STRATEGY = RealignerSortingStrategy.ON_DISK;
|
||||||
|
|
||||||
// ADVANCED OPTIONS FOLLOW
|
// ADVANCED OPTIONS FOLLOW
|
||||||
|
|
||||||
|
|
@ -122,16 +128,22 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
if ( NWAY_OUTPUT ) {
|
if ( NWAY_OUTPUT ) {
|
||||||
Map<File, SAMFileReader> readerMap = getToolkit().getFileToReaderMapping();
|
Map<File, SAMFileReader> readerMap = getToolkit().getFileToReaderMapping();
|
||||||
for ( File file : readerMap.keySet() ) {
|
for ( File file : readerMap.keySet() ) {
|
||||||
|
SAMFileHeader header = readerMap.get(file).getFileHeader();
|
||||||
|
if ( SORTING_STRATEGY == RealignerSortingStrategy.NO_SORT )
|
||||||
|
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||||
String newFileName = file.getName().substring(0, file.getName().length()-3) + outputSuffix + ".bam";
|
String newFileName = file.getName().substring(0, file.getName().length()-3) + outputSuffix + ".bam";
|
||||||
SAMFileWriter writer = factory.makeBAMWriter(readerMap.get(file).getFileHeader(), SORT_IN_MEMORY, new File(baseWriterFilename, newFileName), compressionLevel);
|
SAMFileWriter writer = factory.makeBAMWriter(header, SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY, new File(baseWriterFilename, newFileName), compressionLevel);
|
||||||
if ( SORT_IN_MEMORY )
|
if ( SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY )
|
||||||
writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW);
|
writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW);
|
||||||
for ( String rg : readGroupMap.get(file) )
|
for ( String rg : readGroupMap.get(file) )
|
||||||
writers.put(rg, writer);
|
writers.put(rg, writer);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
SAMFileWriter writer = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), SORT_IN_MEMORY, new File(baseWriterFilename), compressionLevel);
|
SAMFileHeader header = getToolkit().getSAMFileHeader();
|
||||||
if ( SORT_IN_MEMORY )
|
if ( SORTING_STRATEGY == RealignerSortingStrategy.NO_SORT )
|
||||||
|
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||||
|
SAMFileWriter writer = factory.makeBAMWriter(header, SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY, new File(baseWriterFilename), compressionLevel);
|
||||||
|
if ( SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY )
|
||||||
writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW);
|
writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW);
|
||||||
for ( Set<String> set : readGroupMap.values() ) {
|
for ( Set<String> set : readGroupMap.values() ) {
|
||||||
for ( String rg : set )
|
for ( String rg : set )
|
||||||
|
|
@ -203,7 +215,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( Map.Entry<SAMFileWriter, Set<SAMRecord>> entry : bins.entrySet() ) {
|
for ( Map.Entry<SAMFileWriter, Set<SAMRecord>> entry : bins.entrySet() ) {
|
||||||
if ( SORT_IN_MEMORY ) {
|
if ( SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY ) {
|
||||||
// we can be efficient in this case by batching the reads all together
|
// we can be efficient in this case by batching the reads all together
|
||||||
((SortingSAMFileWriter)entry.getKey()).addAlignments(entry.getValue());
|
((SortingSAMFileWriter)entry.getKey()).addAlignments(entry.getValue());
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -260,7 +272,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
readsToClean.clear();
|
readsToClean.clear();
|
||||||
readsNotToClean.clear();
|
readsNotToClean.clear();
|
||||||
|
|
||||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
do {
|
||||||
|
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||||
|
} while ( currentInterval != null && currentInterval.isBefore(readLoc) );
|
||||||
|
|
||||||
// call back into map now that the state has been updated
|
// call back into map now that the state has been updated
|
||||||
map(ref, read);
|
map(ref, read);
|
||||||
|
|
@ -291,7 +305,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
HashSet<SAMFileWriter> uniqueWriters = new HashSet<SAMFileWriter>(writers.values());
|
HashSet<SAMFileWriter> uniqueWriters = new HashSet<SAMFileWriter>(writers.values());
|
||||||
for ( SAMFileWriter writer : uniqueWriters ) {
|
for ( SAMFileWriter writer : uniqueWriters ) {
|
||||||
writer.close();
|
writer.close();
|
||||||
if ( SORT_IN_MEMORY )
|
if ( SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY )
|
||||||
((SortingSAMFileWriter)writer).getBaseWriter().close();
|
((SortingSAMFileWriter)writer).getBaseWriter().close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -319,20 +333,27 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int mismatchQualitySumIgnoreCigar(final AlignedRead aRead, final byte[] refSeq, int refIndex) {
|
private static int mismatchQualitySumIgnoreCigar(final AlignedRead aRead, final byte[] refSeq, int refIndex, int quitAboveThisValue) {
|
||||||
final byte[] readSeq = aRead.getRead().getReadBases();
|
final byte[] readSeq = aRead.getRead().getReadBases();
|
||||||
final byte[] quals = aRead.getRead().getBaseQualities();
|
final byte[] quals = aRead.getRead().getBaseQualities();
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) {
|
for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) {
|
||||||
if ( refIndex >= refSeq.length )
|
if ( refIndex >= refSeq.length ) {
|
||||||
sum += MAX_QUAL;
|
sum += MAX_QUAL;
|
||||||
else {
|
// optimization: once we pass the threshold, stop calculating
|
||||||
|
if ( sum > quitAboveThisValue )
|
||||||
|
return sum;
|
||||||
|
} else {
|
||||||
byte refChr = refSeq[refIndex];
|
byte refChr = refSeq[refIndex];
|
||||||
byte readChr = readSeq[readIndex];
|
byte readChr = readSeq[readIndex];
|
||||||
if ( !BaseUtils.isRegularBase((char)readChr) || !BaseUtils.isRegularBase((char)refChr) )
|
if ( !BaseUtils.isRegularBase((char)readChr) || !BaseUtils.isRegularBase((char)refChr) )
|
||||||
continue; // do not count Ns/Xs/etc ?
|
continue; // do not count Ns/Xs/etc ?
|
||||||
if ( readChr != refChr )
|
if ( readChr != refChr ) {
|
||||||
sum += (int)quals[readIndex];
|
sum += (int)quals[readIndex];
|
||||||
|
// optimization: once we pass the threshold, stop calculating
|
||||||
|
if ( sum > quitAboveThisValue )
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sum;
|
return sum;
|
||||||
|
|
@ -396,7 +417,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
|
final int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, Integer.MAX_VALUE);
|
||||||
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
|
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
|
||||||
|
|
||||||
// if this doesn't match perfectly to the reference, let's try to clean it
|
// if this doesn't match perfectly to the reference, let's try to clean it
|
||||||
|
|
@ -457,12 +478,18 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
|
|
||||||
while ( iter.hasNext() ) {
|
while ( iter.hasNext() ) {
|
||||||
Consensus consensus = iter.next();
|
Consensus consensus = iter.next();
|
||||||
|
int indelSize = 0;
|
||||||
|
for ( CigarElement ce : consensus.cigar.getCigarElements() )
|
||||||
|
if ( ce.getOperator() == CigarOperator.I || ce.getOperator() == CigarOperator.D ) {
|
||||||
|
indelSize = ce.getLength();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// if ( debugOn ) System.out.println("Consensus: "+consensus.str);
|
// if ( debugOn ) System.out.println("Consensus: "+consensus.str);
|
||||||
|
|
||||||
for ( int j = 0; j < altReads.size(); j++ ) {
|
for ( int j = 0; j < altReads.size(); j++ ) {
|
||||||
AlignedRead toTest = altReads.get(j);
|
AlignedRead toTest = altReads.get(j);
|
||||||
Pair<Integer, Integer> altAlignment = findBestOffset(consensus.str, toTest);
|
Pair<Integer, Integer> altAlignment = findBestOffset(consensus.str, toTest, (int)leftmostIndex, indelSize);
|
||||||
|
|
||||||
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
|
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
|
||||||
int myScore = altAlignment.second;
|
int myScore = altAlignment.second;
|
||||||
|
|
@ -668,20 +695,41 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
return new Consensus(altConsensus, cigar, indexOnRef);
|
return new Consensus(altConsensus, cigar, indexOnRef);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Pair<Integer, Integer> findBestOffset(final byte[] ref, final AlignedRead read) {
|
private Pair<Integer, Integer> findBestOffset(final byte[] ref, final AlignedRead read, final int leftmostIndex, int indelSize) {
|
||||||
final int attempts = ref.length - read.getReadLength() + 1;
|
|
||||||
int bestScore = mismatchQualitySumIgnoreCigar(read, ref, 0);
|
// optimization: try the most likely alignment first (to get a low score to beat)
|
||||||
int bestIndex = 0;
|
int originalAlignment = read.getOriginalAlignmentStart() - leftmostIndex;
|
||||||
for ( int i = 1; i < attempts; i++ ) {
|
int bestScore = mismatchQualitySumIgnoreCigar(read, ref, originalAlignment, Integer.MAX_VALUE);
|
||||||
// we can't get better than 0!
|
int bestIndex = originalAlignment;
|
||||||
if ( bestScore == 0 )
|
|
||||||
return new Pair<Integer, Integer>(bestIndex, 0);
|
// optimization: we can't get better than 0, so we can quit now
|
||||||
int score = mismatchQualitySumIgnoreCigar(read, ref, i);
|
if ( bestScore == 0 )
|
||||||
|
return new Pair<Integer, Integer>(bestIndex, 0);
|
||||||
|
|
||||||
|
// optimization: the correct alignment shouldn't be too far from the original one (or else the read wouldn't have aligned in the first place)
|
||||||
|
for ( int i = 0; i < originalAlignment; i++ ) {
|
||||||
|
int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore);
|
||||||
if ( score < bestScore ) {
|
if ( score < bestScore ) {
|
||||||
bestScore = score;
|
bestScore = score;
|
||||||
bestIndex = i;
|
bestIndex = i;
|
||||||
}
|
}
|
||||||
|
// optimization: we can't get better than 0, so we can quit now
|
||||||
|
if ( bestScore == 0 )
|
||||||
|
return new Pair<Integer, Integer>(bestIndex, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final int maxPossibleStart = ref.length - read.getReadLength();
|
||||||
|
for ( int i = originalAlignment + 1; i <= maxPossibleStart; i++ ) {
|
||||||
|
int score = mismatchQualitySumIgnoreCigar(read, ref, i, bestScore);
|
||||||
|
if ( score < bestScore ) {
|
||||||
|
bestScore = score;
|
||||||
|
bestIndex = i;
|
||||||
|
}
|
||||||
|
// optimization: we can't get better than 0, so we can quit now
|
||||||
|
if ( bestScore == 0 )
|
||||||
|
return new Pair<Integer, Integer>(bestIndex, 0);
|
||||||
|
}
|
||||||
|
|
||||||
return new Pair<Integer, Integer>(bestIndex, bestScore);
|
return new Pair<Integer, Integer>(bestIndex, bestScore);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue