Pulling out the left-alignment code for indels so that other walkers can use it.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3251 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9e28e4eb42
commit
42bcca1010
|
|
@ -470,7 +470,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
|
||||
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
|
||||
if ( numBlocks == 2 ) {
|
||||
Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
|
||||
Cigar newCigar = AlignmentUtils.leftAlignIndel(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
|
||||
aRead.setCigar(newCigar);
|
||||
}
|
||||
|
||||
|
|
@ -600,7 +600,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0);
|
||||
if ( improvement >= LOD_THRESHOLD && bestConsensus.mismatchSum <= totalAlignerMismatchSum ) {
|
||||
|
||||
bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
|
||||
bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
|
||||
|
||||
// start cleaning the appropriate reads
|
||||
for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
|
||||
|
|
@ -1005,167 +1005,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
return reduces;
|
||||
}
|
||||
|
||||
/** Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
|
||||
* starting at 0-based position <code>refIndex</code> on the <code>refSeq</code> and specified by its <code>cigar</code>.
|
||||
* The last argument <code>readIndex</code> specifies 0-based position on the read where the alignment described by the
|
||||
* <code>cigar</code> starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
|
||||
* Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
|
||||
* should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
|
||||
* always the positions where the cigar starts on the ref and on the read, respectively.
|
||||
*
|
||||
* If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
|
||||
* specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
|
||||
* as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
|
||||
* is not repeated), the original cigar is returned.
|
||||
* @param cigar structure of the original alignment
|
||||
* @param refSeq reference sequence the read is aligned to
|
||||
* @param readSeq read sequence
|
||||
* @param refIndex 0-based alignment start position on ref
|
||||
* @param readIndex 0-based alignment start position on read
|
||||
* @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
|
||||
*/
|
||||
private Cigar indelRealignment(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
|
||||
if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do
|
||||
|
||||
final CigarElement ce1 = cigar.getCigarElement(0);
|
||||
final CigarElement ce2 = cigar.getCigarElement(1);
|
||||
|
||||
// we currently can not handle clipped reads; alternatively, if the alignment starts from insertion, there
|
||||
// is no place on the read to move that insertion further left; so we are done:
|
||||
if ( ce1.getOperator() != CigarOperator.M ) return cigar;
|
||||
|
||||
int difference = 0; // we can move indel 'difference' bases left
|
||||
final int indel_length = ce2.getLength();
|
||||
|
||||
int period = 0; // period of the inserted/deleted sequence
|
||||
int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion)
|
||||
int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion)
|
||||
|
||||
byte[] indelString = new byte[ce2.getLength()]; // inserted or deleted sequence
|
||||
|
||||
if ( ce2.getOperator() == CigarOperator.D )
|
||||
System.arraycopy(refSeq, indelIndexOnRef, indelString, 0, ce2.getLength());
|
||||
else if ( ce2.getOperator() == CigarOperator.I )
|
||||
System.arraycopy(readSeq, indelIndexOnRead, indelString, 0, ce2.getLength());
|
||||
else
|
||||
// we can get here if there is soft clipping done at the beginning of the read
|
||||
// for now, we'll just punt the issue and not try to realign these
|
||||
return cigar;
|
||||
|
||||
// now we have to check all WHOLE periods of the indel sequence:
|
||||
// for instance, if
|
||||
// REF: AGCTATATATAGCC
|
||||
// READ: GCTAT***TAGCC
|
||||
// the deleted sequence ATA does have period of 2, but deletion obviously can not be
|
||||
// shifted left by 2 bases (length 3 does not contain whole number of periods of 2);
|
||||
// however if 4 bases are deleted:
|
||||
// REF: AGCTATATATAGCC
|
||||
// READ: GCTA****TAGCC
|
||||
// the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases!
|
||||
// Also, we will always have to check the length of the indel sequence itself (trivial period). If the smallest
|
||||
// period is 1 (which means that indel sequence is a homo-nucleotide sequence), we obviously do not have to check
|
||||
// any other periods.
|
||||
|
||||
// NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence
|
||||
// repeats itsels on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion
|
||||
// position, we will move insertion left, to the position right after CA. This way, while moving the indel across the repeat
|
||||
// on the ref, we can theoretically move it across a non-repeat on the read if the latter has a mismtach.
|
||||
|
||||
while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength
|
||||
|
||||
period = BaseUtils.sequencePeriod(indelString, period+1);
|
||||
|
||||
if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work
|
||||
|
||||
int newIndex = indelIndexOnRef;
|
||||
|
||||
while ( newIndex >= period ) { // let's see if there is a repeat, i.e. if we could also say that same bases at lower position are deleted
|
||||
|
||||
// lets check if bases [newIndex-period,newIndex) immediately preceding the indel on the ref
|
||||
// are the same as the currently checked period of the inserted sequence:
|
||||
|
||||
boolean match = true;
|
||||
|
||||
for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) {
|
||||
byte indelChr = indelString[indelPos];
|
||||
if ( refSeq[testRefPos] != indelChr || !BaseUtils.isRegularBase((char)indelChr) ) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( match ) {
|
||||
newIndex -= period; // yes, they are the same, we can move indel farther left by at least period bases, go check if we can do more...
|
||||
}
|
||||
else {
|
||||
break; // oops, no match, can not push indel farther left
|
||||
}
|
||||
}
|
||||
|
||||
final int newDifference = indelIndexOnRef - newIndex;
|
||||
if ( newDifference > difference ) difference = newDifference; // deletion should be moved 'difference' bases left
|
||||
|
||||
if ( period == 1 ) break; // we do not have to check all periods of homonucleotide sequences, we already
|
||||
// got maximum possible shift after checking period=1 above.
|
||||
}
|
||||
|
||||
// if ( ce2.getLength() >= 2 )
|
||||
// System.out.println("-----------------------------------\n FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0)));
|
||||
|
||||
|
||||
if ( difference > 0 ) {
|
||||
|
||||
// The following if() statement: this should've never happened, unless the alignment is really screwed up.
|
||||
// A real life example:
|
||||
//
|
||||
// ref: TTTTTTTTTTTTTTTTTT******TTTTTACTTATAGAAGAAAT...
|
||||
// read: GTCTTTTTTTTTTTTTTTTTTTTTTTACTTATAGAAGAAAT...
|
||||
//
|
||||
// i.e. the alignment claims 6 T's to be inserted. The alignment is clearly malformed/non-conforming since we could
|
||||
// have just 3 T's inserted (so that the beginning of the read maps right onto the beginning of the
|
||||
// reference fragment shown): that would leave us with same 2 mismatches at the beginning of the read
|
||||
// (G and C) but lower gap penalty. Note that this has nothing to do with the alignment being "right" or "wrong"
|
||||
// with respect to where on the DNA the read actually came from. It is the assumptions of *how* the alignments are
|
||||
// built and represented that are broken here. While it is unclear how the alignment shown above could be generated
|
||||
// in the first place, we are not in the business of fixing incorrect alignments in this method; all we are
|
||||
// trying to do is to left-adjust correct ones. So if something like that happens, we refuse to change the cigar
|
||||
// and bail out.
|
||||
if ( ce1.getLength()-difference < 0 ) return cigar;
|
||||
|
||||
Cigar newCigar = new Cigar();
|
||||
// do not add leading M cigar element if its length is zero (i.e. if we managed to left-shift the
|
||||
// insertion all the way to the read start):
|
||||
if ( ce1.getLength() - difference > 0 )
|
||||
newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
|
||||
newCigar.add(ce2); // add the indel, now it's left shifted since we decreased the number of preceding matching bases
|
||||
|
||||
if ( cigar.numCigarElements() > 2 ) {
|
||||
// if we got something following the indel element:
|
||||
|
||||
if ( cigar.getCigarElement(2).getOperator() == CigarOperator.M ) {
|
||||
// if indel was followed by matching bases (that's the most common situation),
|
||||
// increase the length of the matching section after the indel by the amount of left shift
|
||||
// (matching bases that were on the left are now *after* the indel; we have also checked at the beginning
|
||||
// that the first cigar element was also M):
|
||||
newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M));
|
||||
} else {
|
||||
// if the element after the indel was not M, we have to add just the matching bases that were on the left
|
||||
// and now appear after the indel after we performed the shift. Then add the original element that followed the indel.
|
||||
newCigar.add(new CigarElement(difference, CigarOperator.M));
|
||||
newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength(),cigar.getCigarElement(2).getOperator()));
|
||||
}
|
||||
// now add remaining (unchanged) cigar elements, if any:
|
||||
for ( int i = 3 ; i < cigar.numCigarElements() ; i++ ) {
|
||||
newCigar.add(new CigarElement(cigar.getCigarElement(i).getLength(),cigar.getCigarElement(i).getOperator()));
|
||||
}
|
||||
}
|
||||
|
||||
//logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar));
|
||||
cigar = newCigar;
|
||||
|
||||
}
|
||||
return cigar;
|
||||
}
|
||||
|
||||
private class AlignedRead {
|
||||
private final SAMRecord read;
|
||||
private Cigar newCigar = null;
|
||||
|
|
|
|||
|
|
@ -379,4 +379,165 @@ public class AlignmentUtils {
|
|||
|
||||
return BaseUtils.reverse(read.getBaseQualities());
|
||||
}
|
||||
|
||||
/** Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
|
||||
* starting at 0-based position <code>refIndex</code> on the <code>refSeq</code> and specified by its <code>cigar</code>.
|
||||
* The last argument <code>readIndex</code> specifies 0-based position on the read where the alignment described by the
|
||||
* <code>cigar</code> starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
|
||||
* Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
|
||||
* should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
|
||||
* always the positions where the cigar starts on the ref and on the read, respectively.
|
||||
*
|
||||
* If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
|
||||
* specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
|
||||
* as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
|
||||
* is not repeated), the original cigar is returned.
|
||||
* @param cigar structure of the original alignment
|
||||
* @param refSeq reference sequence the read is aligned to
|
||||
* @param readSeq read sequence
|
||||
* @param refIndex 0-based alignment start position on ref
|
||||
* @param readIndex 0-based alignment start position on read
|
||||
* @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
|
||||
*/
|
||||
public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
|
||||
if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do
|
||||
|
||||
final CigarElement ce1 = cigar.getCigarElement(0);
|
||||
final CigarElement ce2 = cigar.getCigarElement(1);
|
||||
|
||||
// we currently can not handle clipped reads; alternatively, if the alignment starts from insertion, there
|
||||
// is no place on the read to move that insertion further left; so we are done:
|
||||
if ( ce1.getOperator() != CigarOperator.M ) return cigar;
|
||||
|
||||
int difference = 0; // we can move indel 'difference' bases left
|
||||
final int indel_length = ce2.getLength();
|
||||
|
||||
int period = 0; // period of the inserted/deleted sequence
|
||||
int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion)
|
||||
int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion)
|
||||
|
||||
byte[] indelString = new byte[ce2.getLength()]; // inserted or deleted sequence
|
||||
|
||||
if ( ce2.getOperator() == CigarOperator.D )
|
||||
System.arraycopy(refSeq, indelIndexOnRef, indelString, 0, ce2.getLength());
|
||||
else if ( ce2.getOperator() == CigarOperator.I )
|
||||
System.arraycopy(readSeq, indelIndexOnRead, indelString, 0, ce2.getLength());
|
||||
else
|
||||
// we can get here if there is soft clipping done at the beginning of the read
|
||||
// for now, we'll just punt the issue and not try to realign these
|
||||
return cigar;
|
||||
|
||||
// now we have to check all WHOLE periods of the indel sequence:
|
||||
// for instance, if
|
||||
// REF: AGCTATATATAGCC
|
||||
// READ: GCTAT***TAGCC
|
||||
// the deleted sequence ATA does have period of 2, but deletion obviously can not be
|
||||
// shifted left by 2 bases (length 3 does not contain whole number of periods of 2);
|
||||
// however if 4 bases are deleted:
|
||||
// REF: AGCTATATATAGCC
|
||||
// READ: GCTA****TAGCC
|
||||
// the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases!
|
||||
// Also, we will always have to check the length of the indel sequence itself (trivial period). If the smallest
|
||||
// period is 1 (which means that indel sequence is a homo-nucleotide sequence), we obviously do not have to check
|
||||
// any other periods.
|
||||
|
||||
// NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence
|
||||
// repeats itsels on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion
|
||||
// position, we will move insertion left, to the position right after CA. This way, while moving the indel across the repeat
|
||||
// on the ref, we can theoretically move it across a non-repeat on the read if the latter has a mismtach.
|
||||
|
||||
while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength
|
||||
|
||||
period = BaseUtils.sequencePeriod(indelString, period+1);
|
||||
|
||||
if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work
|
||||
|
||||
int newIndex = indelIndexOnRef;
|
||||
|
||||
while ( newIndex >= period ) { // let's see if there is a repeat, i.e. if we could also say that same bases at lower position are deleted
|
||||
|
||||
// lets check if bases [newIndex-period,newIndex) immediately preceding the indel on the ref
|
||||
// are the same as the currently checked period of the inserted sequence:
|
||||
|
||||
boolean match = true;
|
||||
|
||||
for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) {
|
||||
byte indelChr = indelString[indelPos];
|
||||
if ( refSeq[testRefPos] != indelChr || !BaseUtils.isRegularBase((char)indelChr) ) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( match ) {
|
||||
newIndex -= period; // yes, they are the same, we can move indel farther left by at least period bases, go check if we can do more...
|
||||
}
|
||||
else {
|
||||
break; // oops, no match, can not push indel farther left
|
||||
}
|
||||
}
|
||||
|
||||
final int newDifference = indelIndexOnRef - newIndex;
|
||||
if ( newDifference > difference ) difference = newDifference; // deletion should be moved 'difference' bases left
|
||||
|
||||
if ( period == 1 ) break; // we do not have to check all periods of homonucleotide sequences, we already
|
||||
// got maximum possible shift after checking period=1 above.
|
||||
}
|
||||
|
||||
// if ( ce2.getLength() >= 2 )
|
||||
// System.out.println("-----------------------------------\n FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0)));
|
||||
|
||||
|
||||
if ( difference > 0 ) {
|
||||
|
||||
// The following if() statement: this should've never happened, unless the alignment is really screwed up.
|
||||
// A real life example:
|
||||
//
|
||||
// ref: TTTTTTTTTTTTTTTTTT******TTTTTACTTATAGAAGAAAT...
|
||||
// read: GTCTTTTTTTTTTTTTTTTTTTTTTTACTTATAGAAGAAAT...
|
||||
//
|
||||
// i.e. the alignment claims 6 T's to be inserted. The alignment is clearly malformed/non-conforming since we could
|
||||
// have just 3 T's inserted (so that the beginning of the read maps right onto the beginning of the
|
||||
// reference fragment shown): that would leave us with same 2 mismatches at the beginning of the read
|
||||
// (G and C) but lower gap penalty. Note that this has nothing to do with the alignment being "right" or "wrong"
|
||||
// with respect to where on the DNA the read actually came from. It is the assumptions of *how* the alignments are
|
||||
// built and represented that are broken here. While it is unclear how the alignment shown above could be generated
|
||||
// in the first place, we are not in the business of fixing incorrect alignments in this method; all we are
|
||||
// trying to do is to left-adjust correct ones. So if something like that happens, we refuse to change the cigar
|
||||
// and bail out.
|
||||
if ( ce1.getLength()-difference < 0 ) return cigar;
|
||||
|
||||
Cigar newCigar = new Cigar();
|
||||
// do not add leading M cigar element if its length is zero (i.e. if we managed to left-shift the
|
||||
// insertion all the way to the read start):
|
||||
if ( ce1.getLength() - difference > 0 )
|
||||
newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
|
||||
newCigar.add(ce2); // add the indel, now it's left shifted since we decreased the number of preceding matching bases
|
||||
|
||||
if ( cigar.numCigarElements() > 2 ) {
|
||||
// if we got something following the indel element:
|
||||
|
||||
if ( cigar.getCigarElement(2).getOperator() == CigarOperator.M ) {
|
||||
// if indel was followed by matching bases (that's the most common situation),
|
||||
// increase the length of the matching section after the indel by the amount of left shift
|
||||
// (matching bases that were on the left are now *after* the indel; we have also checked at the beginning
|
||||
// that the first cigar element was also M):
|
||||
newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M));
|
||||
} else {
|
||||
// if the element after the indel was not M, we have to add just the matching bases that were on the left
|
||||
// and now appear after the indel after we performed the shift. Then add the original element that followed the indel.
|
||||
newCigar.add(new CigarElement(difference, CigarOperator.M));
|
||||
newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength(),cigar.getCigarElement(2).getOperator()));
|
||||
}
|
||||
// now add remaining (unchanged) cigar elements, if any:
|
||||
for ( int i = 3 ; i < cigar.numCigarElements() ; i++ ) {
|
||||
newCigar.add(new CigarElement(cigar.getCigarElement(i).getLength(),cigar.getCigarElement(i).getOperator()));
|
||||
}
|
||||
}
|
||||
|
||||
//logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar));
|
||||
cigar = newCigar;
|
||||
|
||||
}
|
||||
return cigar;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue