From 5c889580c4b3703e18b8726c63996dd13e511408 Mon Sep 17 00:00:00 2001 From: asivache Date: Fri, 13 May 2011 21:12:54 +0000 Subject: [PATCH] Change of logic: if "read" (sequence 2) sticks out beyond the boundary of the ref (sequence 1) it is aligned to, the extra bases on the left or on the right will be softclipped in the cigar generated for such an alignment, rather than added to the firts/last M block. This also affects alignment offset: if read starts before the ref (used to be represented by a negative offset), the cigar now will start with S, and the returned offset (alignment start) will be 0. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5802 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/SWPairwiseAlignment.java | 72 +++++++++++++------ 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index 1f1adca72..4b7fa3e41 100755 --- a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -52,8 +52,10 @@ public class SWPairwiseAlignment { private static final int MSTATE = 0; private static final int ISTATE = 1; private static final int DSTATE = 2; + private static final int CLIP = 3; private static boolean cutoff = false; + private static boolean DO_SOFTCLIP = true; double[] SW; @@ -276,12 +278,17 @@ public class SWPairwiseAlignment { } // System.out.println(" Found max score="+maxscore+" at p1="+p1+ " p2="+p2); + List lce = new ArrayList(5); + + if ( segment_length > 0 && DO_SOFTCLIP ) { + lce.add(makeElement(CLIP, segment_length)); + segment_length = 0; + } // we will be placing all insertions and deletions into sequence b, so the states are named w/regard // to that sequence int state = MSTATE; - List lce = new ArrayList(5); int data_offset = p1*(m+1)+p2; // offset of element [p1][p2] // System.out.println("Backtracking: starts at "+p1+":"+p2+" ("+sw[data_offset]+")"); @@ -331,23 +338,34 @@ public class SWPairwiseAlignment { // post-process the last segment we are still keeping; // NOTE: if reads "overhangs" the ref on the left (i.e. if p2>0) we are counting - // those extra bases sticking out of the ref into the first cigar element. For instance, + // those extra bases sticking out of the ref into the first cigar element if DO_SOFTCLIP is false; + // otherwise they will be softclipped. For instance, // if read length is 5 and alignment starts at offset -2 (i.e. read starts before the ref, and only - // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M (and not, e.g. - // 2I3M). The consumers need to check for the alignment offset and deal with it properly. - lce.add(makeElement(state, segment_length + p2)); - alignment_offset = p1 - p2; + // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if + // DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true. + // The consumers need to check for the alignment offset and deal with it properly. + if (DO_SOFTCLIP ) { + lce.add(makeElement(state, segment_length)); + if ( p2> 0 ) lce.add(makeElement(CLIP, p2)); + alignment_offset = p1 ; + } else { + lce.add(makeElement(state, segment_length + p2)); + alignment_offset = p1 - p2; + } Collections.reverse(lce); alignmentCigar = new Cigar(lce); + } + private CigarElement makeElement(int state, int segment_length) { CigarOperator o = null; switch(state) { case MSTATE: o = CigarOperator.M; break; case ISTATE: o = CigarOperator.I; break; case DSTATE: o = CigarOperator.D; break; + case CLIP: o = CigarOperator.S; break; } return new CigarElement(segment_length,o); } @@ -447,25 +465,30 @@ public class SWPairwiseAlignment { Cigar cigar = a.getCigar(); + if ( ! DO_SOFTCLIP ) { - if ( offset < 0 ) { - for ( ; j < (-offset) ; j++ ) { - bread.append((char)read[j]); - bref.append(' '); - match.append(' '); - } - // at negative offsets, our cigar's first element carries overhanging bases - // that we have just printed above. Tweak the first element to - // exclude those bases. Here we create a new list of cigar elements, so the original - // list/original cigar are unchanged (they are unmodifiable anyway!) + // we need to go through all the hassle below only if we do not do softclipping; + // otherwise offset is never negative + if ( offset < 0 ) { + for ( ; j < (-offset) ; j++ ) { + bread.append((char)read[j]); + bref.append(' '); + match.append(' '); + } + // at negative offsets, our cigar's first element carries overhanging bases + // that we have just printed above. Tweak the first element to + // exclude those bases. Here we create a new list of cigar elements, so the original + // list/original cigar are unchanged (they are unmodifiable anyway!) - List tweaked = new ArrayList(); - tweaked.addAll(cigar.getCigarElements()); - tweaked.set(0,new CigarElement(cigar.getCigarElement(0).getLength()+offset, + List tweaked = new ArrayList(); + tweaked.addAll(cigar.getCigarElements()); + tweaked.set(0,new CigarElement(cigar.getCigarElement(0).getLength()+offset, cigar.getCigarElement(0).getOperator())); - cigar = new Cigar(tweaked); + cigar = new Cigar(tweaked); + } + } - } else { + if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. offset = 0 for ( ; i < a.getAlignmentStart2wrt1() ; i++ ) { bref.append((char)ref[i]); bread.append(' '); @@ -489,6 +512,13 @@ public class SWPairwiseAlignment { match.append('I'); } break; + case S : + for ( int z = 0 ; z < e.getLength(); z++, j++ ) { + bref.append(' '); + bread.append((char)read[j]); + match.append('S'); + } + break; case D: for ( int z = 0 ; z < e.getLength(); z++ , i++ ) { bref.append((char)ref[i]);