git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@184 348d0f76-0448-11de-a6fe-93d51630548a

This commit is contained in:
asivache 2009-03-25 05:48:10 +00:00
parent 8bdf49a01f
commit 40f45c2333
9 changed files with 632 additions and 160 deletions

View File

@ -0,0 +1,178 @@
package org.broadinstitute.sting.playground.indels;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import edu.mit.broad.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.playground.utils.CountedObject;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Mar 25, 2009
* Time: 12:15:38 AM
* To change this template use File | Settings | File Templates.
*/
public class AlignmentUtils {
/** Computes number of mismatches in the read alignment to the refence <code>ref</code>
* specified in the record <code>r</code>. Indels are completely <i>ignored</i> by this method:
* only base mismatches in the alignment segments where both sequences are present are counted.
* @param r
* @param ref
* @return
*/
public static int numMismatches(SAMRecord r, ReferenceSequence ref) {
if ( r.getReadUnmappedFlag() ) return 1000000;
int i_ref = r.getAlignmentStart()-1; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches
Cigar c = r.getCigar();
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
CigarElement ce = c.getCigarElement(k);
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != 'N' ) continue; // do not count N's ?
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++;
}
break;
case I: i_read += ce.getLength(); break;
case D: i_ref += ce.getLength(); break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return mm_count;
}
public static int numMismatches(SAMRecord r, String ref) {
if ( r.getReadUnmappedFlag() ) return 1000000;
int i_ref = r.getAlignmentStart()-1; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches
Cigar c = r.getCigar();
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
CigarElement ce = c.getCigarElement(k);
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ?
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
Character.toUpperCase(ref.charAt(i_ref)) ) mm_count++;
}
break;
case I: i_read += ce.getLength(); break;
case D: i_ref += ce.getLength(); break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return mm_count;
}
/** Reads through the alignment cigar and returns all indels found in the alignment as a collection
* of Indel objects.
* @param c
* @param start
* @return
*/
public static Collection<Indel> extractIndels(final Cigar c, final int start) {
//
// firstpos,lastpos span of the indel will be interpreted as follows:
// any alignment that ends strictly before firstpos or starts strictly after lastpos
// on the *reference* (not inclusive!) does not overlap with an indel; in the case of
// insertion it will result in firstpos > lastpos!
// lastpos
// | firstpos
// | |
// v v
// ---------III----- Ref Insertion: bases I are not in the ref; any alignment that starts
// after lastpos or ends before firstpos *on the reference*
// is completely over the reference bases to the right or to
// the left, respectively, of the insertion site
//
// firstpos
// | lastpos
// | |
// v v
//------------------ Ref Deletion: any alignment that ends before firstpos or starts after lastpos
// -----DDD--- alignment on the reference does not overlap with the deletion
int runninglength = start; // position on the original reference; start = alignment start position
List<Indel> indels = new ArrayList<Indel>(4);
if ( c.numCigarElements() <= 1 ) return indels; // most of the reads have no indels, save a few cycles by returning early
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
final CigarElement ce = c.getCigarElement(i);
Indel curr_indel = null;
switch(ce.getOperator()) {
case I: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I); break;
case D: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.D);
runninglength += ce.getLength();
break;
case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
default :
throw new IllegalArgumentException("Unexpected operator in cigar string");
}
if ( curr_indel == null ) continue; // element was not an indel, go grab next element...
indels.add(curr_indel); // this is a new indel. Add it.
} // end for loop over all alignment cigar elements
return indels;
} // end extractIndels() method
/** Reads through the alignment specified in the record and returns all indels found in the alignment as a collection
* of Indel objects. If read is not mapped, silently returns an empty collection.
* @param r
* @return
*/
public static Collection<Indel> extractIndels(SAMRecord r) {
if ( r.getReadUnmappedFlag() ) return new ArrayList<Indel>();
return extractIndels(r.getCigar(), r.getAlignmentStart());
}
/** Extracts indels from the specified record (@see #extractIndels()) and updates the provided tree object.
* Multiple occurences of the same indel (i.e. with support from multiple reads) will be grouped together
* in one counted object held by the set.
*
* @param r
* @param t
*/
public static void collectAndCountIndels(SAMRecord r, TreeSet<CountedObject<Indel> > t) {
Collection<Indel> indels = AlignmentUtils.extractIndels(r);
for ( Indel ind : indels ) {
CountedObject<Indel> ci = new CountedObject<Indel>(ind);
CountedObject<Indel> found = t.floor(ci);
if ( ci.equals( found ) ) found.increment(); // we did find our indel, advance the counter
else t.add(ci); // this is a new indel. Add it.
}
}
public static String toString(Cigar cig) {
StringBuilder b = new StringBuilder();
for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) {
char c='?';
switch ( cig.getCigarElement(i).getOperator() ) {
case M : c = 'M'; break;
case D : c = 'D'; break;
case I : c = 'I'; break;
}
b.append(c);
b.append(cig.getCigarElement(i).getLength());
}
return b.toString();
}
}

View File

@ -132,7 +132,7 @@ public class ConsensusSequence {
*/
public char baseAt(int offset) {
assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds";
int [] cov = coverage.get(offset+(int)referencePos);
int [] cov = coverage.get(offset-startOffset);
int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
int bmax = 0;
char base = 'N';
@ -153,7 +153,7 @@ public class ConsensusSequence {
*/
public Pair<Character,Integer> baseWithCountAt(int offset) {
assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds";
int [] cov = coverage.get(offset+(int)referencePos);
int [] cov = coverage.get(offset-startOffset);
int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
int bmax = 0;
char base = 'N';
@ -174,7 +174,7 @@ public class ConsensusSequence {
*/
public int coverageAt(int offset) {
if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0;
int [] cov = coverage.get(offset+(int)referencePos);
int [] cov = coverage.get(offset-startOffset);
return cov[0]+cov[1]+cov[2]+cov[3];
}

View File

@ -181,33 +181,7 @@ public class IndelInspector extends CommandLineProgram {
return errs;
}
private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) {
if ( r.getReadUnmappedFlag() ) return 1000000;
int i_ref = r.getAlignmentStart()-1; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches
Cigar c = r.getCigar();
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
CigarElement ce = c.getCigarElement(k);
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++ ) {
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++;
i_ref++;
i_read++;
}
break;
case I: i_read += ce.getLength(); break;
case D: i_ref += ce.getLength(); break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return mm_count;
}
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
private static int numMismatches(SAMRecord r) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1

View File

@ -191,8 +191,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
* @param r
* @throws RuntimeException
*/
@Override
public void receive(final SAMRecord r) throws RuntimeException {
public void receive(final SAMRecord r) throws RuntimeException {
if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do

View File

@ -151,8 +151,20 @@ public class MultipleAlignment implements Iterable<Integer> {
return alignment_offsets.get(index.get(id));
}
/** Returns external id of the read the offsets of this multiple alignment are based upon (i.e. all the offsets
* are specified wrt the base read).
* @return
*/
public int getBaseReadId() { return ext_ids.get(0); }
/** Returns offset of the read specified by its external id wrt the start of the consensus sequence in this
* multiple alignment (consenus sequence is a major vote union of all the reads in this alignment).
* @param id
* @return
*/
public int getOffsetWrtConsensus(int id) {
return getOffsetById (id)- consensus.getStartOffset();
}
/** Returns true if the alignment already contains sequence with the specified id.
*
@ -235,49 +247,16 @@ public class MultipleAlignment implements Iterable<Integer> {
if ( seqs.size() == 0 ) return b.toString();
// int first_offset = 0; // offset of the first sequence wrt the start of the MSA
final int first_offset = -consensus.getStartOffset();
// int msa_length = 0;
// for ( int i = 0 ; i < seqs.size() ; i++ ) {
// if ( -alignment_offsets.get(i) > first_offset ) first_offset = -alignment_offsets.get(i);
// msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length );
// }
// msa_length += first_offset;
final int msa_length = consensus.length();
// int [] cov = new int[4];
// char[] bases = { 'A' , 'C', 'G', 'T' };
char[][] consensusString = new char[4][msa_length];
for ( int i = -first_offset ; i < msa_length - first_offset ; i++ ) {
// cov[0] = cov[1] = cov[2] = cov[3] = 0;
// for ( int j = 0 ; j < seqs.size(); j++ ) {
// // offset of the sequence j wrt start of the msa region
// int seq_offset = first_offset + alignment_offsets.get(j);
// if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i
// int base = -1;
// switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) {
// case 'A': base = 0; break;
// case 'C': base = 1 ; break;
// case 'G': base = 2 ; break;
// case 'T': base = 3 ; break;
// }
// if ( base >= 0 ) cov[base]++;
// }
// int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
// int bmax = 0;
// int mm = 0;
for ( int i = 0 ; i < msa_length ; i++ ) {
Pair<Character,Integer> base = consensus.baseWithCountAt(i-first_offset);
consensusString[3][i] = base.first;
// for ( int z = 0; z < 4 ; z++ ) {
// if ( cov[z] > bmax ) {
// bmax = cov[z];
// consensus[3][i] = bases[z];
// mm = total_cov - bmax;
// }
// }
int mm = consensus.coverageAt(i) - base.second;
int mm = consensus.coverageAt(i-first_offset) - base.second;
if ( mm > 0 ) {
consensusString[2][i] = '*';
if ( mm > 9 ) consensusString[0][i] = Character.forDigit(mm/10,10);
@ -307,6 +286,10 @@ public class MultipleAlignment implements Iterable<Integer> {
return b.toString();
}
public String getConsensus() {
return consensus.getSequence();
}
public String toString() { return toString(true); }
public int size() { return seqs.size(); }
@ -338,7 +321,6 @@ public class MultipleAlignment implements Iterable<Integer> {
}
@Override
public Iterator<Integer> iterator() {
return sequenceIdIterator();
}

View File

@ -128,10 +128,26 @@ public class PairwiseAlignment {
// lets extend these kmer matches and count mismatches:
for ( Integer trial_offset : offsets ) {
int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm);
if ( mm_cnt < best_mm ) {
int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm+1);
// if ( (i1==4||i1==8) && i2==18) {
// if ( i1== 18 ) System.out.print("to " + i2+" : ");
// else System.out.print("to " + i1+" : ");
// System.out.println("offset="+trial_offset.toString()+
// "; mm=" + countMismatches(is1.getSequence(),is2.getSequence(),trial_offset)+
// "(mm_cnt="+mm_cnt+")"+
// "; dist="+distance(is1.getSequence(),is2.getSequence(),trial_offset)+
// "; overlap="+getOverlap(is1.getSequence(),is2.getSequence(),trial_offset));
// }
// save current offset if alignment at this offset has fewer mismatches tham everything we've
// seen so far, or if it has same number of mismatches but has larger overlap (i.e. distance
// between sequences is smaller)
if ( mm_cnt < best_mm ||
( ( mm_cnt == best_mm ) &&
getOverlap(is1.getSequence(),is2.getSequence(),alignment_offset) <
0.8*getOverlap(is1.getSequence(),is2.getSequence(),trial_offset) ) ) {
// if ( (i1==4||i1==8) && i2==18) System.out.println("Saved offset "+trial_offset.toString());
alignment_offset = trial_offset;
next_mm = best_mm;
next_mm = best_mm;
best_mm = mm_cnt;
} else {
if ( mm_cnt < next_mm ) next_mm = mm_cnt;

View File

@ -1,9 +1,13 @@
package org.broadinstitute.sting.playground.indels;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.TextCigarCodec;
import java.util.*;
import org.broadinstitute.sting.utils.PrimitivePair;
import org.broadinstitute.sting.playground.utils.CountedObject;
public class PileBuilder implements RecordPileReceiver {
@ -84,28 +88,35 @@ public class PileBuilder implements RecordPileReceiver {
public void receive(Collection<SAMRecord> c) {
IndexedSequence[] seqs = new IndexedSequence[c.size()];
int i = 0;
int left = 1000000000;
int right = 0;
int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref
int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end)
for ( SAMRecord r : c ) {
seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize);
left = Math.min(left, r.getAlignmentStart() );
right = Math.max(right,r.getAlignmentEnd());
startOnRef = Math.min(startOnRef, r.getAlignmentStart() );
stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd());
}
SequencePile originalAligns = new SequencePile(referenceSequence.substring(left-1,right));
// part of the reference covered by original alignments:
String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef);
int totalMismatches = 0; // total mismatches across all reads
TreeSet< CountedObject<Indel> > all_indels = new TreeSet< CountedObject<Indel> >();
SequencePile originalAligns = new SequencePile(pileRef);
for ( SAMRecord r : c ) {
originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
r.getCigar(), r.getAlignmentStart() - left );
r.getCigar(), r.getAlignmentStart() - startOnRef );
totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence);
AlignmentUtils.collectAndCountIndels(r,all_indels);
}
System.out.println("\n#############################################################################");
System.out.println("ORIGINAL ALIGNMENT: \n");
originalAligns.colorprint(true);
originalAligns.dotprint(true);
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
List<MultipleAlignment> piles = doMultipleAlignment2(seqs);
// System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
// System.out.println("; diameter of PILE2: "+ diameter(alignments2));
@ -120,10 +131,68 @@ public class PileBuilder implements RecordPileReceiver {
d.set(n,m,distance(piles.get(n), piles.get(m)));
}
}
int new_mismatches = 0 ; // number of mismatches after re-alignment:
TreeSet< CountedObject<Indel> > new_indels = new TreeSet< CountedObject<Indel> >(); // new indels after realignment
int shifted_reads = 0;
List<SAMRecord> as_list = (List<SAMRecord>)c; // ugly hack; need this to access records by ids
System.out.println(d.format("%8.4g"));
for ( int n = 0 ; n < piles.size() ; n++ ) {
System.out.println("PILE " + n +"\n" +piles.get(n).toString());
// SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0);
SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4,-0.5);
MultipleAlignment ma = piles.get(n);
for ( Integer id : ma ) {
SAMRecord r = as_list.get(id);
int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq
int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1();
if ( ref_offset != r.getAlignmentStart()) shifted_reads++;
Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar());
System.out.println(AlignmentUtils.toString(cig));
}
System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):");
System.out.println(consToRef.toString());
System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString());
}
}
/** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X,
* and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct
* alignment of the read to Y (i.e. if the alignment of X to Y contains indel(s) and the read spans them, the
* indels will be inserted into the new cigar for read-Y alignment).
* @param s
* @param l
* @param baseCigar
* @return
*/
private Cigar buildCigar(int s, int l, Cigar baseCigar) {
int readpos = 0;
int refpos = 0;
List<CigarElement> lce = new ArrayList<CigarElement>(5); // enough to keep 2 indels. should cover 99.999% of cases...
CigarElement celem = null;
int i = 0;
while ( refpos <= s ) {
celem = baseCigar.getCigarElement(i);
refpos+=celem.getLength();
i++;
}
// we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element
lce.add( new CigarElement(Math.min(refpos-s,l),celem.getOperator()) );
while ( refpos < s+l ) {
celem = baseCigar.getCigarElement(i);
lce.add( new CigarElement(Math.min(celem.getLength(),l - refpos), celem.getOperator()) );
refpos += celem.getLength();
i++;
}
return new Cigar(lce);
}
public void initPairwiseAlignments( IndexedSequence [] seqs ) {
@ -308,7 +377,7 @@ public class PileBuilder implements RecordPileReceiver {
return d_av;
}
/** Computes average distance from sequence i to multiple alignment a for the specified offset of i wrt a
/** Computes average distance from sequence i to multiple alignment a for the specified offset of 'i' wrt 'a'
* and returns that distance and pair of sequence indices, on which the specified offset is realized
* @param a
* @param i

View File

@ -24,20 +24,33 @@ public class SWPairwiseAlignment {
private int alignment_offset; // offset of s2 w/respect to s1
private Cigar alignmentCigar;
private double w_match;
private double w_mismatch;
private double w_open;
private double w_extend;
private int best_mm; // mismatch count
private static final int IMPOSSIBLE = 1000000000;
private static final int MSTATE = 0;
private static final int ISTATE = 1;
private static final int DSTATE = 2;
public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2 ) {
public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2, double match, double mismatch, double open, double extend ) {
s1 = seq1;
s2 = seq2;
i1 = id1;
i2 = id2;
w_match = match;
w_mismatch = mismatch;
w_open = open;
w_extend = extend;
best_mm = IMPOSSIBLE;
//next_mm = IMPOSSIBLE;
align2(s1,s2);
align4(s1,s2);
}
public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2) {
this(seq1,seq2,id1,id2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3)
}
/** Initializes the alignment with pair of sequences (that will be immediately aligned) and
@ -48,6 +61,10 @@ public class SWPairwiseAlignment {
this(seq1,seq2,-1,-1);
}
public SWPairwiseAlignment(String seq1, String seq2, double match, double mismatch, double open, double extend) {
this(seq1,seq2,-1,-1,match,mismatch,open, extend);
}
public Cigar getCigar() { return alignmentCigar ; }
public int getAlignmentStart2wrt1() { return alignment_offset; }
@ -258,6 +275,8 @@ public class SWPairwiseAlignment {
}
/** Allows for separate gap opening and extension penalties, with backtracking.
*
* @param a
@ -405,6 +424,143 @@ public void align3(String a, String b) {
alignment_offset = p.first - p.second;
}
public void align4(String a, String b) {
int n = a.length();
int m = b.length();
double [][] sw = new double[n+1][m+1];
int [][] btrack = new int[n+1][m+1];
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1 ; i < n+1 ; i++ ) {
char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos
for ( int j = 1 ; j < m+1 ; j++ ) {
char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos
double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
double step_down = 0.0 ;
int kd = 0;
for ( int k = 1 ; k < i ; k++ ) {
if ( step_down < sw[i-k][j]+wk(a_base,'-',k) ) {
step_down=sw[i-k][j]+wk(a_base,'-',k);
kd = k;
}
}
double step_right = 0;
int ki = 0;
for ( int k = 1 ; k < j ; k++ ) {
if ( step_right < sw[i][j-k]+wk('-',b_base,k) ) {
step_right=sw[i][j-k]+wk('-',b_base,k);
ki = k;
}
}
if ( step_down > step_right ) {
if ( step_down > step_diag ) {
sw[i][j] = Math.max(0,step_down);
btrack[i][j] = kd; // positive=vertical
}
else {
sw[i][j] = Math.max(0,step_diag);
btrack[i][j] = 0; // 0 = diagonal
}
} else {
// step_down < step_right
if ( step_right > step_diag ) {
sw[i][j] = Math.max(0,step_right);
btrack[i][j] = -ki; // negative = horizontal
} else {
sw[i][j] = Math.max(0,step_diag);
btrack[i][j] = 0; // 0 = diagonal
}
}
sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
}
}
// print(sw,a,b);
PrimitivePair.Int p = new PrimitivePair.Int();
double maxscore = 0.0;
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1 ; i < n+1 ; i++ ) {
if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; }
}
for ( int j = 1 ; j < m+1 ; j++ ) {
if ( sw[n][j] > maxscore ||
sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) {
p.first = n;
p.second = j ;
maxscore = sw[n][j];
}
}
// System.out.println("\ni="+p.first+"; j="+p.second);
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
// we will be placing all insertions and deletions into sequence b, so the state are named w/regard
// to that sequence
int state = MSTATE;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
double [] scores = new double[3];
List<CigarElement> lce = new ArrayList<CigarElement>(5);
do {
int btr = btrack[p.first][p.second];
int step_left = ( btr < 0 ? -btr : 1);
int step_up = ( btr > 0 ? btr : 1 );
int new_state = -1;
if ( btr > 0 ) new_state = DSTATE;
else if ( btr < 0 ) new_state = ISTATE;
else new_state = MSTATE;
int step_length = 1;
// move to next best location in the sw matrix:
switch( new_state ) {
case MSTATE: p.first--; p.second--; break;
case ISTATE: p.second-=step_left; step_length = step_left; break;
case DSTATE: p.first-=step_up; step_length = step_up; break;
}
// now let's see if the state actually changed:
if ( new_state == state ) segment_length+=step_length;
else {
// state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
CigarOperator o=null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
}
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
segment_length = step_length;
state = new_state;
}
} while ( sw[p.first][p.second] != 0 );
// post-process the last segment we are still keeping
CigarOperator o=null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
}
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
alignment_offset = p.first - p.second;
}
private int w(char x, char y) {
if ( x == y ) return 2; // match
@ -413,12 +569,12 @@ public void align3(String a, String b) {
}
private double wd ( char x, char y ) {
if ( x== y ) return 2.0;
else return -1.0;
if ( x== y ) return w_match;
else return w_mismatch;
}
private double wk(char x, char y, int k) {
return -2.0-k/6; // gap
return w_open+(k-1)*w_extend; // gap
// return -1.0 ; // no extension penalty
// return -1.0-Math.log(k+1); // weak extension penalty
}
@ -507,6 +663,7 @@ public void align3(String a, String b) {
}
public String toString() {
StringBuilder bmm = new StringBuilder();
StringBuilder b1 = new StringBuilder();
StringBuilder b2 = new StringBuilder();
@ -516,12 +673,14 @@ public void align3(String a, String b) {
for ( ; pos2 < -alignment_offset ; pos2++ ) {
b1.append(' ');
b2.append(s2.charAt(pos2));
bmm.append(' ');
}
// now pos2 = -alignment_offset;
} else {
for ( ; pos1 < alignment_offset ; pos1++ ) {
b2.append(' ');
b1.append(s1.charAt(pos1));
bmm.append(' ');
}
// now pos1 = alignment_offset
}
@ -530,73 +689,85 @@ public void align3(String a, String b) {
CigarElement ce = getCigar().getCigarElement(i) ;
switch( ce.getOperator() ) {
case M:
int z = ce.getLength();
b1.append(s1, pos1, pos1+z);
b2.append(s2, pos2, pos2+z);
pos1+=z; pos2+=z;
for ( int k = 0 ; k < ce.getLength() ; k++ ) {
if ( Character.toUpperCase(s1.charAt(pos1)) != Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*');
else bmm.append(' ');
b1.append(s1.charAt(pos1++));
b2.append(s2.charAt(pos2++));
}
break;
case I:
for ( int k = 0 ; k < ce.getLength() ; k++ ) {
b1.append('+');
bmm.append('+');
b2.append(s2.charAt(pos2++));
}
break;
case D:
for ( int k = 0 ; k < ce.getLength() ; k++ ) {
b1.append(s1.charAt(pos1++));
bmm.append('-');
b2.append('-');
}
break;
}
}
bmm.append('\n');
b1.append(s1,pos1,s1.length());
bmm.append(b1);
bmm.append('\n');
b2.append(s2,pos2,s2.length());
b1.append('\n');
b1.append(b2);
b1.append('\n');
return b1.toString();
bmm.append(b2);
bmm.append('\n');
return bmm.toString();
}
public static void testMe() {
// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
// String s2 = "TGTATATAGGGTAAGG";
/* String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
String s2 = "TGTATATAGGGTAAGG";
// String s1 = "GGTAAGGC";
// String s2 = "GGTCTCAA";
// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
// String s2 = "TGTTAGGGTCTCAAGG";
// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
// String s2 = "TAGGGTAAGGCTGATCCATGTACCG" ;
String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
String s2 = "CCGTATCATTACCTGGTGTATATAGG";
// String s1 = "GGTGTATATAGGGT" ;
// String s2 = "TGTTAGGG";
testMe(s1,s2);
s1 = "GGTAAGGC";
s2 = "GGTCTCAA";
testMe(s1,s2);
s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
s2 = "TGTTAGGGTCTCAAGG";
testMe(s1,s2);
s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
s2 = "TAGGGTAAGGCTGATCCATGTACCG" ;
testMe(s1,s2);
s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
s2 = "CCGTATCATTACCTGGTGTATATAGG";
testMe(s1,s2);
s1 = "GGTGTATATAGGGT" ;
s2 = "TGTTAGGG";
testMe(s1,s2);
s1 = "AGACAGAGAGAAGG";
s2 = "AGACAGAGAAGG";
testMe(s1,s2);
*/
String s1 = "CCAGCACACAGGTATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA";
String s2 = "CCAGCACACATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA";
testMe(s1,s2);
}
public static void testMe(String s1, String s2) {
SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2);
SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2,3.0,-1.0,-4,-0.5);
for ( int i = 0 ; i < swpa.getCigar().numCigarElements() ; i++ ) {
char c=' ';
switch ( swpa.getCigar().getCigarElement(i).getOperator() ) {
case M : c = 'M'; break;
case D : c = 'D'; break;
case I : c = 'I'; break;
}
System.out.print(c);
System.out.print(swpa.getCigar().getCigarElement(i).getLength());
}
SequencePile sp = new SequencePile(s1);
sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1());
System.out.println();
System.out.println(sp.format());
System.out.println(AlignmentUtils.toString(swpa.getCigar()));
// SequencePile sp = new SequencePile(s1);
// sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1());
// System.out.println();
// System.out.println(sp.format());
System.out.println("--------\n"+swpa.toString());

View File

@ -6,13 +6,16 @@ import net.sf.samtools.*;
public class SequencePile {
private List<MSAColumn> mSeqGrid;
private StringBuffer mRefGrid;
private StringBuilder mRefGrid;
private StringBuilder headerGrid;
private int mDepth;
private List<Boolean> mSeqRC;
public SequencePile(String ref) {
mRefGrid = new StringBuffer( ref );
mRefGrid = new StringBuilder( ref );
headerGrid = new StringBuilder();
for ( int i = 0; i < ref.length(); i++ ) headerGrid.append(' ');
mSeqGrid = new ArrayList<MSAColumn>();
for ( int i = 0 ; i < mRefGrid.length(); i++ ) {
mSeqGrid.add(new MSAColumn());
@ -39,19 +42,21 @@ public class SequencePile {
// alignedSeq = ReverseComplement(seq);
// } else alignedSeq = seq;
mSeqRC.add(isRC);
int pos = 0; // actual position on the grid (reference can have insertions)
// will hold actual position on the grid; reference can have insertions on the grid,
// so position on the grid where we should start placing the read is not refpos!
int pos = 0;
for ( int i = 0 ; i < refpos ; i++ ) { // i is the position on the original reference
// if we got some insertions on the reference prior to refpos, we need to take care of them:
// if we got some insertions on the reference prior to refpos, we need to count them in:
while( mRefGrid.charAt(pos) == '+' ) {
mSeqGrid.get(pos).add(' ');
mSeqGrid.get(pos).add(' '); // add additional spaces in the line that will hold sequence seq
pos++;
}
mSeqGrid.get(pos).add(' '); // fill with ' ' to the left of the read
pos++;
}
// we reached start position of the alignment
// we reached start position of the alignment on the reference grid
int readpos = 0; // position on the read
@ -66,18 +71,17 @@ public class SequencePile {
if ( pos >= 0 ) {
if ( mRefGrid.charAt(pos) !='+' ) { // there was no insertion here yet: add it now!
mRefGrid.insert(pos, '+');
headerGrid.insert(pos,'+');
MSAColumn c = new MSAColumn();
// reads up to the previous depth (prior to adding current read) did not
// have an insertion here, so we insert '*' into all of them:
for ( int k = 0 ; k < mDepth ; k++ ) {
if ( mSeqGrid.get(pos-1).charAt(k) == ' ') c.add(' ');
else c.add('*');
}
mSeqGrid.add(pos, c);
}
try {
mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage()+": "+seq);
mSeqGrid.add(pos, c); // finally, add the base from the current read
}
mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
}
readpos++;
pos++;
@ -91,6 +95,7 @@ public class SequencePile {
}
if ( pos >= mRefGrid.length() ) break;
mSeqGrid.get(pos).add('-'); // mark deletion
headerGrid.setCharAt(pos,'-');
pos++;
}
break;
@ -102,11 +107,10 @@ public class SequencePile {
pos++;
}
if ( pos >= mRefGrid.length() ) break;
try {
mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage()+": "+seq);
}
mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
if ( Character.toUpperCase(alignedSeq.charAt(readpos)) !=
Character.toUpperCase(mRefGrid.charAt(pos))
&& headerGrid.charAt(pos)== ' ') headerGrid.setCharAt(pos,'*');
pos++;
readpos++;
}
@ -158,35 +162,114 @@ public class SequencePile {
public void colorprint() { colorprint(false); }
public void colorprint(boolean printId) {
if ( printId ) System.out.print(" ");
else System.out.print(" ");
System.out.println(mRefGrid);
public void dotprint(boolean printId) {
StringBuilder sb = new StringBuilder();
java.util.Formatter frmt = new java.util.Formatter(sb);
String skip = null;
if ( printId ) skip = new String(" ");
else skip = new String(" ");
System.out.print(formatHeader(skip));
System.out.print(skip);
System.out.println(mRefGrid);
try {
for ( int i = 0 ; i < mDepth; i++ ) {
if ( printId ) System.out.printf("%3d",i);
if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-");
else System.out.print("->");
for ( int j = 0 ; j < mRefGrid.length() ; j++) {
char seqbase = mSeqGrid.get(j).charAt(i);
char refbase = mRefGrid.charAt(j);
if ( isBase(refbase) && isBase(seqbase) &&
Character.toUpperCase(refbase) ==
Character.toUpperCase(seqbase) ) {
if ( mSeqRC.get(i) ) System.out.print(',');
else System.out.print('.');
}
else System.out.print(seqbase);
}
System.out.print('\n');
}
} catch (Exception e) {}
}
public void colorprint(boolean printId) {
String skip = null;
if ( printId ) skip = new String(" ");
else skip = new String(" ");
System.out.print(formatHeader(skip));
System.out.print(skip);
System.out.println(mRefGrid);
try {
for ( int i = 0 ; i < mDepth; i++ ) {
if ( printId ) {
sb.delete(0,sb.length());
frmt.format("%3d:", i);
System.out.print(frmt.out().toString());
}
if ( printId ) System.out.printf("%3d",i);
if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-");
else System.out.print("->");
for ( int j = 0 ; j < mRefGrid.length() ; j++) {
char seqbase = mSeqGrid.get(j).charAt(i);
char refbase = mRefGrid.charAt(j);
if ( isBase(refbase) && isBase(seqbase) && refbase != seqbase ) System.out.print("\033[31m"+seqbase+"\033[30m");
if ( isBase(refbase) && isBase(seqbase) &&
Character.toUpperCase(refbase) !=
Character.toUpperCase(seqbase) ) System.out.print("\033[31m"+seqbase+"\033[30m");
else System.out.print(seqbase);
}
System.out.print('\n');
}
} catch (Exception e) {}
}
private String formatHeader(String leadString) {
char [][] mm_strings = new char[2][mRefGrid.length()];
for ( int i = 0 ; i < mRefGrid.length() ; i++ ) {
int count = 0;
char refC = mRefGrid.charAt(i);
MSAColumn col = mSeqGrid.get(i);
if ( refC == '+' ) {
// count number of observations for insertion
for ( int j = 0 ; j < col.size() ; j++ ) {
if ( col.charAt(j) != '*' ) count++;
}
} else {
if ( headerGrid.charAt(i) == '-' ) {
// count number of observations for deletion
for ( int j = 0 ; j < col.size() ; j++ ) {
if ( col.charAt(j) == '-' ) count++;
}
} else {
if ( headerGrid.charAt(i) == '*') {
for ( int j = 0 ; j < col.size() ; j++ ) {
if ( col.charAt(j)!=' ' &&
Character.toUpperCase(col.charAt(j)) !=
Character.toUpperCase(refC) ) count++;
}
}
}
}
if ( count > 9 ) mm_strings[0][i] = Character.forDigit(count/10,10);
else mm_strings[0][i] = ' ';
if ( count > 0 ) mm_strings[1][i] = Character.forDigit(count%10,10);
else mm_strings[1][i] = ' ';
}
StringBuilder b = new StringBuilder();
b.append(leadString);
b.append(mm_strings[0]);
b.append('\n');
b.append(leadString);
b.append(mm_strings[1]);
b.append('\n');
b.append(leadString);
b.append(headerGrid);
b.append('\n');
return b.toString();
}
private boolean isBase(char b) {
b = Character.toUpperCase(b);
return ( b=='A' ||b == 'C' || b=='G' || b=='T' || b=='N');
}
}