Massive change to the indel realigner code. We now properly deal with soft-clipped reads. Also, improved left-alignment code.

Small change for Ryan to get hard-clipped reads working for the recalibrator.

PLEASE DO NOT RELEASE THIS WEEK.  I still have some more testing to do and need Mark to run WG jobs.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3430 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-05-25 20:04:33 +00:00
parent f3e2aae570
commit 772f558ae0
2 changed files with 309 additions and 117 deletions

View File

@ -315,9 +315,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) {
readsNotToClean.add(read);
} else {
readsToClean.add(read, ref.getBasesAsChars());
readsToClean.add(read, ref.getBases());
// add the rods to the list of known variants
populateKnownIndels(metaDataTracker, null); // todo -- fixme!
populateKnownIndels(metaDataTracker, null);
}
if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) {
@ -420,8 +420,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
private static int mismatchQualitySumIgnoreCigar(final AlignedRead aRead, final byte[] refSeq, int refIndex, int quitAboveThisValue) {
final byte[] readSeq = aRead.getRead().getReadBases();
final byte[] quals = aRead.getRead().getBaseQualities();
final byte[] readSeq = aRead.getReadBases();
final byte[] quals = aRead.getBaseQualities();
int sum = 0;
for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) {
if ( refIndex >= refSeq.length ) {
@ -432,7 +432,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
} else {
byte refChr = refSeq[refIndex];
byte readChr = readSeq[readIndex];
if ( !BaseUtils.isRegularBase((char)readChr) || !BaseUtils.isRegularBase((char)refChr) )
if ( !BaseUtils.isRegularBase(readChr) || !BaseUtils.isRegularBase(refChr) )
continue; // do not count Ns/Xs/etc ?
if ( readChr != refChr ) {
sum += (int)quals[readIndex];
@ -445,14 +445,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
return sum;
}
private static boolean readIsClipped(final SAMRecord read) {
final Cigar c = read.getCigar();
final int n = c.numCigarElements();
if ( c.getCigarElement(n-1).getOperator() == CigarOperator.S ||
c.getCigarElement(0).getOperator() == CigarOperator.S) return true;
return false;
}
private void clean(ReadBin readsToClean) {
final List<SAMRecord> reads = readsToClean.getReads();
@ -466,7 +458,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
final ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>(); // reads that don't perfectly match
final LinkedList<AlignedRead> altAlignmentsToTest = new LinkedList<AlignedRead>(); // should we try to make an alt consensus from the read?
final Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses
long totalAlignerMismatchSum = 0, totalRawMismatchSum = 0;
long totalRawMismatchSum = 0;
// if there are any known indels for this region, get them
for ( VariantContext knownIndel : knownIndelsToTry.values() ) {
@ -487,8 +479,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// System.out.println(read.getReadString());
// }
// we currently can not deal with clipped reads correctly (or a screwy record)
if ( read.getCigar().numCigarElements() == 0 || readIsClipped(read) ) {
// we can not deal with screwy records
if ( read.getCigar().numCigarElements() == 0 ) {
refReads.add(read);
continue;
}
@ -498,12 +490,11 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
if ( numBlocks == 2 ) {
Cigar newCigar = AlignmentUtils.leftAlignIndel(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
aRead.setCigar(newCigar);
Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
aRead.setCigar(newCigar, false);
}
final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex;
totalAlignerMismatchSum += AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef);
final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE);
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
@ -511,14 +502,16 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( rawMismatchScore > 0 ) {
altReads.add(aRead);
//logger.debug("Adding " + aRead.getRead().getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads");
if ( !read.getDuplicateReadFlag() )
totalRawMismatchSum += rawMismatchScore;
aRead.setMismatchScoreToReference(rawMismatchScore);
aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef));
// if it has an indel, let's see if that's the best consensus
if ( numBlocks == 2 ) {
Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getRead().getReadBases());
if ( c == null ) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName());
else
Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases());
if ( c != null )
altConsenses.add(c);
}
@ -539,8 +532,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) {
for ( AlignedRead aRead : altAlignmentsToTest ) {
// do a pairwise alignment against the reference
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadBases());
if ( c != null ) {
// if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ;
altConsenses.add(c);
@ -555,8 +548,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
int index = generator.nextInt(altAlignmentsToTest.size());
AlignedRead aRead = altAlignmentsToTest.remove(index);
// do a pairwise alignment against the reference
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadBases());
if ( c != null )
altConsenses.add(c);
}
@ -569,7 +562,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
while ( iter.hasNext() ) {
Consensus consensus = iter.next();
//logger.debug("Trying new consensus: " + AlignmentUtils.cigarToString(consensus.cigar) + " " + new String(consensus.str));
//logger.debug("Trying new consensus: " + consensus.cigar + " " + new String(consensus.str));
// if ( DEBUG ) {
// System.out.println("Checking consensus with alignment at "+consensus.positionOnReference+" cigar "+consensus.cigar);
@ -590,14 +583,14 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
int myScore = altAlignment.second;
if ( myScore >= toTest.getMismatchScoreToReference() )
if ( myScore > toTest.getAlignerMismatchScore() || myScore >= toTest.getMismatchScoreToReference() )
myScore = toTest.getMismatchScoreToReference();
// keep track of reads that align better to the alternate consensus.
// By pushing alignments with equal scores to the alternate, it means we'll over-call (het -> hom non ref) but are less likely to under-call (het -> ref, het non ref -> het)
else
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first));
//logger.debug(AlignmentUtils.cigarToString(consensus.cigar) + " vs. " + toTest.getRead().getReadName() + "-" + toTest.getRead().getReadString() + " => " + myScore + " vs. " + altAlignment.first);
//logger.debug(consensus.cigar + " vs. " + toTest.getRead().getReadName() + "-" + toTest.getRead().getReadString() + " => " + myScore + " vs. " + toTest.getMismatchScoreToReference());
if ( !toTest.getRead().getDuplicateReadFlag() )
consensus.mismatchSum += myScore;
@ -613,7 +606,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( bestConsensus != null )
bestConsensus.readIndexes.clear();
bestConsensus = consensus;
//logger.debug("New consensus " + AlignmentUtils.cigarToString(bestConsensus.cigar) + " is now best consensus");
//logger.debug("New consensus " + bestConsensus.cigar + " is now best consensus");
} else {
// we do not need this alt consensus, release memory right away!!
consensus.readIndexes.clear();
@ -626,7 +619,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy),
// then clean!
final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0);
if ( improvement >= LOD_THRESHOLD && bestConsensus.mismatchSum <= totalAlignerMismatchSum ) {
if ( improvement >= LOD_THRESHOLD ) {
bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
@ -646,8 +639,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
} catch (Exception e) {}
}
} else {
//System.out.println("CLEAN: " + AlignmentUtils.cigarToString(bestConsensus.cigar) + " " + bestConsensus.str.toString() + " " + bestConsensus.cigar.numCigarElements() );
//logger.debug("CLEAN: " + AlignmentUtils.cigarToString(bestConsensus.cigar) + " " + bestConsensus.str );
//logger.debug("CLEAN: " + bestConsensus.cigar + " " + bestConsensus.str.toString() + " " + bestConsensus.cigar.numCigarElements() );
if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) {
// NOTE: indels are printed out in the format specified for the low-coverage pilot1
// indel calls (tab-delimited): chr position size type sequence
@ -705,7 +697,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
} else if ( statsOutput != null ) {
try {
statsOutput.write(String.format("%s\tFAIL\t%.1f\t%d%n",
readsToClean.getLocation().toString(), improvement, bestConsensus.mismatchSum - totalAlignerMismatchSum));
readsToClean.getLocation().toString(), improvement));
statsOutput.flush();
} catch (Exception e) {}
}
@ -720,7 +712,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < indexOnRef; i++)
sb.append((char)reference[i]);
//logger.debug("CIGAR = " + AlignmentUtils.cigarToString(c));
int indelCount = 0;
int altIdx = 0;
@ -735,6 +726,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
refIdx += elementLength;
break;
case M:
altIdx += elementLength;
case N:
if ( reference.length < refIdx + elementLength )
ok_flag = false;
else {
@ -742,7 +735,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
sb.append((char)reference[refIdx+j]);
}
refIdx += elementLength;
altIdx += elementLength;
break;
case I:
for (int j = 0; j < elementLength; j++) {
@ -756,6 +748,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
altIdx += elementLength;
indelCount++;
break;
case S:
default:
break;
}
}
// make sure that there is at most only a single indel and it aligns appropriately!
@ -867,7 +862,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
else {
if ( altCE1.getOperator() != CigarOperator.M )
throw new StingException("First element of the alt consensus cigar must be M or I. Actual: "+altCigar.toString());
if ( altCE2.getOperator() == CigarOperator.I || altCE2.getOperator() == CigarOperator.D ) indelCE=altCE2;
if ( altCE2.getOperator() == CigarOperator.I || altCE2.getOperator() == CigarOperator.D )
indelCE=altCE2;
else
throw new StingException("When first element of the alt consensus is M, the second one must be I or D. Actual: "+altCigar.toString());
leadingMatchingBlockLength = altCE1.getLength();
@ -925,7 +921,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
return;
}
int readRemaining = aRead.getReadLength();
int readRemaining = aRead.getReadBases().length;
for ( CigarElement ce : readCigar.getCigarElements() ) {
if ( ce.getOperator() != CigarOperator.D )
readRemaining -= ce.getLength();
@ -951,8 +947,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
continue;
int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex;
final byte[] readStr = read.getRead().getReadBases();
final byte[] quals = read.getRead().getBaseQualities();
final byte[] readStr = read.getReadBases();
final byte[] quals = read.getBaseQualities();
for (int j=0; j < readStr.length; j++, refIdx++ ) {
if ( refIdx < 0 || refIdx >= reference.length ) {
@ -988,8 +984,10 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
case D:
refIdx += elementLength;
break;
case S:
default:
break;
}
}
}
@ -1033,11 +1031,27 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
return reduces;
}
private static Cigar unclipCigar(Cigar cigar) {
ArrayList<CigarElement> elements = new ArrayList<CigarElement>(cigar.numCigarElements());
for ( CigarElement ce : cigar.getCigarElements() ) {
if ( !isClipOperator(ce.getOperator()) )
elements.add(ce);
}
return new Cigar(elements);
}
private static boolean isClipOperator(CigarOperator op) {
return op == CigarOperator.S || op == CigarOperator.H || op == CigarOperator.P;
}
private class AlignedRead {
private final SAMRecord read;
private byte[] readBases = null;
private byte[] baseQuals = null;
private Cigar newCigar = null;
private int newStart = -1;
private int mismatchScoreToReference;
private int mismatchScoreToReference = 0;
private long alignerMismatchScore = 0;
public AlignedRead(SAMRecord read) {
this.read = read;
@ -1049,25 +1063,101 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
public int getReadLength() {
return read.getReadLength();
return readBases != null ? readBases.length : read.getReadLength();
}
public byte[] getReadBases() {
if ( readBases == null )
getUnclippedBases();
return readBases;
}
public byte[] getBaseQualities() {
if ( baseQuals == null )
getUnclippedBases();
return baseQuals;
}
// pull out the bases that aren't clipped out
private void getUnclippedBases() {
readBases = new byte[getReadLength()];
baseQuals = new byte[getReadLength()];
byte[] actualReadBases = read.getReadBases();
byte[] actualBaseQuals = read.getBaseQualities();
int fromIndex = 0, toIndex = 0;
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
int elementLength = ce.getLength();
switch ( ce.getOperator() ) {
case S:
fromIndex += elementLength;
break;
case M:
case I:
System.arraycopy(actualReadBases, fromIndex, readBases, toIndex, elementLength);
System.arraycopy(actualBaseQuals, fromIndex, baseQuals, toIndex, elementLength);
fromIndex += elementLength;
toIndex += elementLength;
default:
break;
}
}
// if we got clipped, trim the array
if ( fromIndex != toIndex ) {
byte[] trimmedRB = new byte[toIndex];
byte[] trimmedBQ = new byte[toIndex];
System.arraycopy(readBases, 0, trimmedRB, 0, toIndex);
System.arraycopy(baseQuals, 0, trimmedBQ, 0, toIndex);
readBases = trimmedRB;
baseQuals = trimmedBQ;
}
}
public Cigar getCigar() {
return (newCigar != null ? newCigar : read.getCigar());
}
// tentatively sets the new Cigar, but it needs to be confirmed later
// returns true if the new cigar is a valid change (i.e. not same as original and doesn't remove indel)
public boolean setCigar(Cigar cigar) {
if ( getCigar().equals(cigar) )
return false;
public void setCigar(Cigar cigar) {
setCigar(cigar, true);
}
String str = AlignmentUtils.cigarToString(cigar);
// tentatively sets the new Cigar, but it needs to be confirmed later
public void setCigar(Cigar cigar, boolean fixClippedCigar) {
if ( fixClippedCigar && getReadBases().length < read.getReadLength() )
cigar = reclipCigar(cigar);
// no change?
if ( getCigar().equals(cigar) )
return;
// no indel?
String str = cigar.toString();
if ( !str.contains("D") && !str.contains("I") )
return false;
return;
newCigar = cigar;
return true;
}
// pull out the bases that aren't clipped out
private Cigar reclipCigar(Cigar cigar) {
ArrayList<CigarElement> elements = new ArrayList<CigarElement>();
int i = 0;
int n = read.getCigar().numCigarElements();
while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) )
elements.add(read.getCigar().getCigarElement(i++));
elements.addAll(cigar.getCigarElements());
i++;
while ( i < n && !isClipOperator(read.getCigar().getCigarElement(i).getOperator()) )
i++;
while ( i < n && isClipOperator(read.getCigar().getCigarElement(i).getOperator()) )
elements.add(read.getCigar().getCigarElement(i++));
return new Cigar(elements);
}
// tentatively sets the new start, but it needs to be confirmed later
@ -1120,6 +1210,14 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
public int getMismatchScoreToReference() {
return mismatchScoreToReference;
}
public void setAlignerMismatchScore(long score) {
alignerMismatchScore = score;
}
public long getAlignerMismatchScore() {
return alignerMismatchScore;
}
}
private class Consensus {
@ -1155,12 +1253,12 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
private class ReadBin {
private final ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
private char[] reference = null;
private byte[] reference = null;
private GenomeLoc loc = null;
public ReadBin() { }
public void add(SAMRecord read, char[] ref) {
public void add(SAMRecord read, byte[] ref) {
reads.add(read);
// set up the reference
@ -1173,7 +1271,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( neededBases > ref.length )
throw new StingException("Read " + read.getReadName() + " does not overlap the previous read in this interval; please ensure that you are using the same input bam that was used in the RealignerTargetCreator step");
if ( neededBases > 0 ) {
char[] newReference = new char[reference.length + neededBases];
byte[] newReference = new byte[reference.length + neededBases];
System.arraycopy(reference, 0, newReference, 0, reference.length);
System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases);
reference = newReference;
@ -1185,14 +1283,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
public List<SAMRecord> getReads() { return reads; }
public byte[] getRereference() {
// upper case it
for ( int i = 0; i < reference.length; i++ )
reference[i] = Character.toUpperCase(reference[i]);
// convert it to a byte array
byte[] refArray = new byte[reference.length];
StringUtil.charsToBytes(reference, 0, reference.length, refArray, 0);
return refArray;
return reference;
}
public GenomeLoc getLocation() { return loc; }

View File

@ -33,9 +33,11 @@ import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import java.util.ArrayList;
import java.util.Arrays;
public class AlignmentUtils {
@ -230,58 +232,6 @@ public class AlignmentUtils {
return n;
}
public static String alignmentToString(final Cigar cigar,final String seq, final String ref, final int posOnRef ) {
return alignmentToString( cigar, seq, ref, posOnRef, 0 );
}
public static String cigarToString(Cigar cig) {
return cig.toString();
}
public static String alignmentToString(final Cigar cigar,final String seq, final String ref, final int posOnRef, final int posOnRead ) {
int readPos = posOnRead;
int refPos = posOnRef;
StringBuilder refLine = new StringBuilder();
StringBuilder readLine = new StringBuilder();
for ( int i = 0 ; i < posOnRead ; i++ ) {
refLine.append( ref.charAt( refPos - readPos + i ) );
readLine.append( seq.charAt(i) ) ;
}
for ( int i = 0 ; i < cigar.numCigarElements() ; i++ ) {
final CigarElement ce = cigar.getCigarElement(i);
switch(ce.getOperator()) {
case I:
for ( int j = 0 ; j < ce.getLength(); j++ ) {
refLine.append('+');
readLine.append( seq.charAt( readPos++ ) );
}
break;
case D:
for ( int j = 0 ; j < ce.getLength(); j++ ) {
readLine.append('*');
refLine.append( ref.charAt( refPos++ ) );
}
break;
case M:
for ( int j = 0 ; j < ce.getLength(); j++ ) {
refLine.append(ref.charAt( refPos++ ) );
readLine.append( seq.charAt( readPos++ ) );
}
break;
default: throw new StingException("Unsupported cigar operator: "+ce.getOperator() );
}
}
refLine.append('\n');
refLine.append(readLine);
refLine.append('\n');
return refLine.toString();
}
public static char[] alignmentToCharArray( final Cigar cigar, final char[] read, final char[] ref ) {
final char[] alignment = new char[read.length];
@ -291,11 +241,12 @@ public class AlignmentUtils {
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
final CigarElement ce = cigar.getCigarElement(iii);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
case S:
for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) {
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
alignment[alignPos++] = '+';
}
break;
@ -304,12 +255,15 @@ public class AlignmentUtils {
refPos++;
break;
case M:
for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) {
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
alignment[alignPos] = ref[refPos];
alignPos++;
refPos++;
}
break;
case H:
case P:
break;
default:
throw new StingException( "Unsupported cigar operator: " + ce.getOperator() );
}
@ -372,6 +326,152 @@ public class AlignmentUtils {
* @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
*/
public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
int indexOfIndel = -1;
for ( int i = 0; i < cigar.numCigarElements(); i++ ) {
CigarElement ce = cigar.getCigarElement(i);
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
// if there is more than 1 indel, don't left align
if ( indexOfIndel != -1 )
return cigar;
indexOfIndel = i;
}
}
// if there is no indel or if the alignment starts with an insertion (so that there
// is no place on the read to move that insertion further left), we are done
if ( indexOfIndel < 1 ) return cigar;
final int indelLength = cigar.getCigarElement(indexOfIndel).getLength();
byte[] altString = createIndelString(cigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
Cigar newCigar = cigar;
for ( int i = 0; i < indelLength; i++ ) {
newCigar = moveCigarLeft(newCigar, indexOfIndel);
byte[] newAltString = createIndelString(newCigar, indexOfIndel, refSeq, readSeq, refIndex, readIndex);
// check to make sure we haven't run off the end of the read
boolean reachedEndOfRead = cigarHasZeroSizeElement(newCigar);
if ( Arrays.equals(altString, newAltString) ) {
cigar = newCigar;
i = -1;
if ( reachedEndOfRead )
cigar = cleanUpCigar(cigar);
}
if ( reachedEndOfRead )
break;
}
return cigar;
}
private static boolean cigarHasZeroSizeElement(Cigar c) {
for ( CigarElement ce : c.getCigarElements() ) {
if ( ce.getLength() == 0 )
return true;
}
return false;
}
private static Cigar cleanUpCigar(Cigar c) {
ArrayList<CigarElement> elements = new ArrayList<CigarElement>(c.numCigarElements()-1);
for ( CigarElement ce : c.getCigarElements() ) {
if ( ce.getLength() != 0 &&
(elements.size() != 0 || ce.getOperator() != CigarOperator.D) ) {
elements.add(ce);
}
}
return new Cigar(elements);
}
private static Cigar moveCigarLeft(Cigar cigar, int indexOfIndel) {
// get the first few elements
ArrayList<CigarElement> elements = new ArrayList<CigarElement>(cigar.numCigarElements());
for ( int i = 0; i < indexOfIndel - 1; i++)
elements.add(cigar.getCigarElement(i));
// get the indel element and move it left one base
CigarElement ce = cigar.getCigarElement(indexOfIndel-1);
elements.add(new CigarElement(ce.getLength()-1, ce.getOperator()));
elements.add(cigar.getCigarElement(indexOfIndel));
ce = cigar.getCigarElement(indexOfIndel+1);
elements.add(new CigarElement(ce.getLength()+1, ce.getOperator()));
// get the last few elements
for ( int i = indexOfIndel + 2; i < cigar.numCigarElements(); i++)
elements.add(cigar.getCigarElement(i));
return new Cigar(elements);
}
private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
CigarElement indel = cigar.getCigarElement(indexOfIndel);
int indelLength = indel.getLength();
// the indel-based reference string
byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))];
for ( int i = 0; i < indexOfIndel; i++ ) {
CigarElement ce = cigar.getCigarElement(i);
int length = ce.getLength();
switch( ce.getOperator() ) {
case M:
readIndex += length;
refIndex += length;
break;
case S:
readIndex += length;
break;
case N:
refIndex += length;
break;
default:
break;
}
}
// add the bases before the indel
System.arraycopy(refSeq, 0, alt, 0, refIndex);
int currentPos = refIndex;
// take care of the indel
if ( indel.getOperator() == CigarOperator.D ) {
refIndex += indelLength;
} else {
System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength);
currentPos += indelLength;
}
// add the bases after the indel
System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex);
return alt;
}
/** Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
* starting at 0-based position <code>refIndex</code> on the <code>refSeq</code> and specified by its <code>cigar</code>.
* The last argument <code>readIndex</code> specifies 0-based position on the read where the alignment described by the
* <code>cigar</code> starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
* Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
* should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
* always the positions where the cigar starts on the ref and on the read, respectively.
*
* If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
* specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
* as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
* is not repeated), the original cigar is returned.
* @param cigar structure of the original alignment
* @param refSeq reference sequence the read is aligned to
* @param readSeq read sequence
* @param refIndex 0-based alignment start position on ref
* @param readIndex 0-based alignment start position on read
* @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
*/
/*
public static Cigar leftAlignIndelOld(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do
final CigarElement ce1 = cigar.getCigarElement(0);
@ -512,4 +612,5 @@ public class AlignmentUtils {
}
return cigar;
}
*/
}