From 40f45c233326c8041e505387220f245a02027a89 Mon Sep 17 00:00:00 2001 From: asivache Date: Wed, 25 Mar 2009 05:48:10 +0000 Subject: [PATCH] git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@184 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/indels/AlignmentUtils.java | 178 ++++++++++++ .../playground/indels/ConsensusSequence.java | 6 +- .../playground/indels/IndelInspector.java | 28 +- .../indels/IndelRecordPileCollector.java | 3 +- .../playground/indels/MultipleAlignment.java | 56 ++-- .../playground/indels/PairwiseAlignment.java | 22 +- .../sting/playground/indels/PileBuilder.java | 91 +++++- .../indels/SWPairwiseAlignment.java | 261 +++++++++++++++--- .../sting/playground/indels/SequencePile.java | 147 +++++++--- 9 files changed, 632 insertions(+), 160 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java diff --git a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java new file mode 100644 index 000000000..5a72eec3c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java @@ -0,0 +1,178 @@ +package org.broadinstitute.sting.playground.indels; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import edu.mit.broad.picard.reference.ReferenceSequence; +import org.broadinstitute.sting.playground.utils.CountedObject; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Mar 25, 2009 + * Time: 12:15:38 AM + * To change this template use File | Settings | File Templates. + */ +public class AlignmentUtils { + + /** Computes number of mismatches in the read alignment to the refence ref + * specified in the record r. Indels are completely ignored by this method: + * only base mismatches in the alignment segments where both sequences are present are counted. + * @param r + * @param ref + * @return + */ + public static int numMismatches(SAMRecord r, ReferenceSequence ref) { + if ( r.getReadUnmappedFlag() ) return 1000000; + int i_ref = r.getAlignmentStart()-1; // position on the ref + int i_read = 0; // position on the read + int mm_count = 0; // number of mismatches + Cigar c = r.getCigar(); + for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { + CigarElement ce = c.getCigarElement(k); + switch( ce.getOperator() ) { + case M: + for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { + if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != 'N' ) continue; // do not count N's ? + if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != + Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++; + } + break; + case I: i_read += ce.getLength(); break; + case D: i_ref += ce.getLength(); break; + default: throw new RuntimeException("Unrecognized cigar element"); + } + + } + return mm_count; + } + + public static int numMismatches(SAMRecord r, String ref) { + if ( r.getReadUnmappedFlag() ) return 1000000; + int i_ref = r.getAlignmentStart()-1; // position on the ref + int i_read = 0; // position on the read + int mm_count = 0; // number of mismatches + Cigar c = r.getCigar(); + for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { + CigarElement ce = c.getCigarElement(k); + switch( ce.getOperator() ) { + case M: + for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { + if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ? + if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != + Character.toUpperCase(ref.charAt(i_ref)) ) mm_count++; + } + break; + case I: i_read += ce.getLength(); break; + case D: i_ref += ce.getLength(); break; + default: throw new RuntimeException("Unrecognized cigar element"); + } + + } + return mm_count; + } + + /** Reads through the alignment cigar and returns all indels found in the alignment as a collection + * of Indel objects. + * @param c + * @param start + * @return + */ + public static Collection extractIndels(final Cigar c, final int start) { + // + // firstpos,lastpos span of the indel will be interpreted as follows: + // any alignment that ends strictly before firstpos or starts strictly after lastpos + // on the *reference* (not inclusive!) does not overlap with an indel; in the case of + // insertion it will result in firstpos > lastpos! + // lastpos + // | firstpos + // | | + // v v + // ---------III----- Ref Insertion: bases I are not in the ref; any alignment that starts + // after lastpos or ends before firstpos *on the reference* + // is completely over the reference bases to the right or to + // the left, respectively, of the insertion site + // + // firstpos + // | lastpos + // | | + // v v + //------------------ Ref Deletion: any alignment that ends before firstpos or starts after lastpos + // -----DDD--- alignment on the reference does not overlap with the deletion + int runninglength = start; // position on the original reference; start = alignment start position + + List indels = new ArrayList(4); + + if ( c.numCigarElements() <= 1 ) return indels; // most of the reads have no indels, save a few cycles by returning early + + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + + final CigarElement ce = c.getCigarElement(i); + Indel curr_indel = null; + + switch(ce.getOperator()) { + case I: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I); break; + case D: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.D); + runninglength += ce.getLength(); + break; + case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment + default : + throw new IllegalArgumentException("Unexpected operator in cigar string"); + } + + if ( curr_indel == null ) continue; // element was not an indel, go grab next element... + + indels.add(curr_indel); // this is a new indel. Add it. + } // end for loop over all alignment cigar elements + + return indels; + } // end extractIndels() method + + /** Reads through the alignment specified in the record and returns all indels found in the alignment as a collection + * of Indel objects. If read is not mapped, silently returns an empty collection. + * @param r + * @return + */ + public static Collection extractIndels(SAMRecord r) { + if ( r.getReadUnmappedFlag() ) return new ArrayList(); + return extractIndels(r.getCigar(), r.getAlignmentStart()); + } + + /** Extracts indels from the specified record (@see #extractIndels()) and updates the provided tree object. + * Multiple occurences of the same indel (i.e. with support from multiple reads) will be grouped together + * in one counted object held by the set. + * + * @param r + * @param t + */ + public static void collectAndCountIndels(SAMRecord r, TreeSet > t) { + Collection indels = AlignmentUtils.extractIndels(r); + for ( Indel ind : indels ) { + CountedObject ci = new CountedObject(ind); + CountedObject found = t.floor(ci); + + if ( ci.equals( found ) ) found.increment(); // we did find our indel, advance the counter + else t.add(ci); // this is a new indel. Add it. + + } + + } + + public static String toString(Cigar cig) { + StringBuilder b = new StringBuilder(); + + for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) { + char c='?'; + switch ( cig.getCigarElement(i).getOperator() ) { + case M : c = 'M'; break; + case D : c = 'D'; break; + case I : c = 'I'; break; + } + b.append(c); + b.append(cig.getCigarElement(i).getLength()); + } + return b.toString(); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java index 75ed67954..518d4bd71 100644 --- a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java +++ b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java @@ -132,7 +132,7 @@ public class ConsensusSequence { */ public char baseAt(int offset) { assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds"; - int [] cov = coverage.get(offset+(int)referencePos); + int [] cov = coverage.get(offset-startOffset); int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; int bmax = 0; char base = 'N'; @@ -153,7 +153,7 @@ public class ConsensusSequence { */ public Pair baseWithCountAt(int offset) { assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds"; - int [] cov = coverage.get(offset+(int)referencePos); + int [] cov = coverage.get(offset-startOffset); int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; int bmax = 0; char base = 'N'; @@ -174,7 +174,7 @@ public class ConsensusSequence { */ public int coverageAt(int offset) { if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0; - int [] cov = coverage.get(offset+(int)referencePos); + int [] cov = coverage.get(offset-startOffset); return cov[0]+cov[1]+cov[2]+cov[3]; } diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java index 7116b4264..16d466213 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java @@ -181,33 +181,7 @@ public class IndelInspector extends CommandLineProgram { return errs; } - private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) { - if ( r.getReadUnmappedFlag() ) return 1000000; - int i_ref = r.getAlignmentStart()-1; // position on the ref - int i_read = 0; // position on the read - int mm_count = 0; // number of mismatches - Cigar c = r.getCigar(); - for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { - CigarElement ce = c.getCigarElement(k); - switch( ce.getOperator() ) { - case M: - for ( int l = 0 ; l < ce.getLength() ; l++ ) { - if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != - Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++; - i_ref++; - i_read++; - } - break; - case I: i_read += ce.getLength(); break; - case D: i_ref += ce.getLength(); break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - - } - return mm_count; - } - - /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */ + /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */ private static int numMismatches(SAMRecord r) throws RuntimeException { // NM currently stores the total number of mismatches in all blocks + 1 diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java index 2caa4cc6a..37f0965a5 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java @@ -191,8 +191,7 @@ public class IndelRecordPileCollector implements RecordReceiver { * @param r * @throws RuntimeException */ - @Override - public void receive(final SAMRecord r) throws RuntimeException { + public void receive(final SAMRecord r) throws RuntimeException { if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do diff --git a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java index 6586d6301..94dbd6c33 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java +++ b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java @@ -151,8 +151,20 @@ public class MultipleAlignment implements Iterable { return alignment_offsets.get(index.get(id)); } + /** Returns external id of the read the offsets of this multiple alignment are based upon (i.e. all the offsets + * are specified wrt the base read). + * @return + */ + public int getBaseReadId() { return ext_ids.get(0); } - + /** Returns offset of the read specified by its external id wrt the start of the consensus sequence in this + * multiple alignment (consenus sequence is a major vote union of all the reads in this alignment). + * @param id + * @return + */ + public int getOffsetWrtConsensus(int id) { + return getOffsetById (id)- consensus.getStartOffset(); + } /** Returns true if the alignment already contains sequence with the specified id. * @@ -235,49 +247,16 @@ public class MultipleAlignment implements Iterable { if ( seqs.size() == 0 ) return b.toString(); -// int first_offset = 0; // offset of the first sequence wrt the start of the MSA final int first_offset = -consensus.getStartOffset(); -// int msa_length = 0; -// for ( int i = 0 ; i < seqs.size() ; i++ ) { -// if ( -alignment_offsets.get(i) > first_offset ) first_offset = -alignment_offsets.get(i); -// msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length ); -// } -// msa_length += first_offset; final int msa_length = consensus.length(); -// int [] cov = new int[4]; -// char[] bases = { 'A' , 'C', 'G', 'T' }; char[][] consensusString = new char[4][msa_length]; - for ( int i = -first_offset ; i < msa_length - first_offset ; i++ ) { -// cov[0] = cov[1] = cov[2] = cov[3] = 0; -// for ( int j = 0 ; j < seqs.size(); j++ ) { -// // offset of the sequence j wrt start of the msa region -// int seq_offset = first_offset + alignment_offsets.get(j); -// if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i -// int base = -1; -// switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) { -// case 'A': base = 0; break; -// case 'C': base = 1 ; break; -// case 'G': base = 2 ; break; -// case 'T': base = 3 ; break; -// } -// if ( base >= 0 ) cov[base]++; -// } -// int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; -// int bmax = 0; -// int mm = 0; + for ( int i = 0 ; i < msa_length ; i++ ) { Pair base = consensus.baseWithCountAt(i-first_offset); consensusString[3][i] = base.first; -// for ( int z = 0; z < 4 ; z++ ) { -// if ( cov[z] > bmax ) { -// bmax = cov[z]; -// consensus[3][i] = bases[z]; -// mm = total_cov - bmax; -// } -// } - int mm = consensus.coverageAt(i) - base.second; + int mm = consensus.coverageAt(i-first_offset) - base.second; if ( mm > 0 ) { consensusString[2][i] = '*'; if ( mm > 9 ) consensusString[0][i] = Character.forDigit(mm/10,10); @@ -307,6 +286,10 @@ public class MultipleAlignment implements Iterable { return b.toString(); } + public String getConsensus() { + return consensus.getSequence(); + } + public String toString() { return toString(true); } public int size() { return seqs.size(); } @@ -338,7 +321,6 @@ public class MultipleAlignment implements Iterable { } - @Override public Iterator iterator() { return sequenceIdIterator(); } diff --git a/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java index 153d10dd0..23bc37005 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java +++ b/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java @@ -128,10 +128,26 @@ public class PairwiseAlignment { // lets extend these kmer matches and count mismatches: for ( Integer trial_offset : offsets ) { - int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm); - if ( mm_cnt < best_mm ) { + int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm+1); +// if ( (i1==4||i1==8) && i2==18) { +// if ( i1== 18 ) System.out.print("to " + i2+" : "); +// else System.out.print("to " + i1+" : "); +// System.out.println("offset="+trial_offset.toString()+ +// "; mm=" + countMismatches(is1.getSequence(),is2.getSequence(),trial_offset)+ +// "(mm_cnt="+mm_cnt+")"+ +// "; dist="+distance(is1.getSequence(),is2.getSequence(),trial_offset)+ +// "; overlap="+getOverlap(is1.getSequence(),is2.getSequence(),trial_offset)); +// } + // save current offset if alignment at this offset has fewer mismatches tham everything we've + // seen so far, or if it has same number of mismatches but has larger overlap (i.e. distance + // between sequences is smaller) + if ( mm_cnt < best_mm || + ( ( mm_cnt == best_mm ) && + getOverlap(is1.getSequence(),is2.getSequence(),alignment_offset) < + 0.8*getOverlap(is1.getSequence(),is2.getSequence(),trial_offset) ) ) { +// if ( (i1==4||i1==8) && i2==18) System.out.println("Saved offset "+trial_offset.toString()); alignment_offset = trial_offset; - next_mm = best_mm; + next_mm = best_mm; best_mm = mm_cnt; } else { if ( mm_cnt < next_mm ) next_mm = mm_cnt; diff --git a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java index 4b4712a37..4a071d506 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java +++ b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java @@ -1,9 +1,13 @@ package org.broadinstitute.sting.playground.indels; import net.sf.samtools.SAMRecord; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.TextCigarCodec; import java.util.*; import org.broadinstitute.sting.utils.PrimitivePair; +import org.broadinstitute.sting.playground.utils.CountedObject; public class PileBuilder implements RecordPileReceiver { @@ -84,28 +88,35 @@ public class PileBuilder implements RecordPileReceiver { public void receive(Collection c) { IndexedSequence[] seqs = new IndexedSequence[c.size()]; int i = 0; - int left = 1000000000; - int right = 0; + int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref + int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end) for ( SAMRecord r : c ) { seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize); - left = Math.min(left, r.getAlignmentStart() ); - right = Math.max(right,r.getAlignmentEnd()); + startOnRef = Math.min(startOnRef, r.getAlignmentStart() ); + stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd()); } - SequencePile originalAligns = new SequencePile(referenceSequence.substring(left-1,right)); + // part of the reference covered by original alignments: + String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef); + + int totalMismatches = 0; // total mismatches across all reads + TreeSet< CountedObject > all_indels = new TreeSet< CountedObject >(); + + SequencePile originalAligns = new SequencePile(pileRef); for ( SAMRecord r : c ) { originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), - r.getCigar(), r.getAlignmentStart() - left ); + r.getCigar(), r.getAlignmentStart() - startOnRef ); + totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence); + AlignmentUtils.collectAndCountIndels(r,all_indels); } + System.out.println("\n#############################################################################"); System.out.println("ORIGINAL ALIGNMENT: \n"); - originalAligns.colorprint(true); + originalAligns.dotprint(true); System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ; List piles = doMultipleAlignment2(seqs); - - // System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); // System.out.print("; diameter of PILE1: "+ diameter(alignments1)); // System.out.println("; diameter of PILE2: "+ diameter(alignments2)); @@ -120,10 +131,68 @@ public class PileBuilder implements RecordPileReceiver { d.set(n,m,distance(piles.get(n), piles.get(m))); } } + + int new_mismatches = 0 ; // number of mismatches after re-alignment: + TreeSet< CountedObject > new_indels = new TreeSet< CountedObject >(); // new indels after realignment + int shifted_reads = 0; + + List as_list = (List)c; // ugly hack; need this to access records by ids + System.out.println(d.format("%8.4g")); for ( int n = 0 ; n < piles.size() ; n++ ) { - System.out.println("PILE " + n +"\n" +piles.get(n).toString()); +// SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0); + SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4,-0.5); + + MultipleAlignment ma = piles.get(n); + for ( Integer id : ma ) { + SAMRecord r = as_list.get(id); + int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq + int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1(); + if ( ref_offset != r.getAlignmentStart()) shifted_reads++; + Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar()); + System.out.println(AlignmentUtils.toString(cig)); + } + + System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):"); + System.out.println(consToRef.toString()); + System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString()); } + + } + + /** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X, + * and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct + * alignment of the read to Y (i.e. if the alignment of X to Y contains indel(s) and the read spans them, the + * indels will be inserted into the new cigar for read-Y alignment). + * @param s + * @param l + * @param baseCigar + * @return + */ + private Cigar buildCigar(int s, int l, Cigar baseCigar) { + int readpos = 0; + int refpos = 0; + + List lce = new ArrayList(5); // enough to keep 2 indels. should cover 99.999% of cases... + + CigarElement celem = null; + int i = 0; + while ( refpos <= s ) { + celem = baseCigar.getCigarElement(i); + refpos+=celem.getLength(); + i++; + } + // we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element + + lce.add( new CigarElement(Math.min(refpos-s,l),celem.getOperator()) ); + + while ( refpos < s+l ) { + celem = baseCigar.getCigarElement(i); + lce.add( new CigarElement(Math.min(celem.getLength(),l - refpos), celem.getOperator()) ); + refpos += celem.getLength(); + i++; + } + return new Cigar(lce); } public void initPairwiseAlignments( IndexedSequence [] seqs ) { @@ -308,7 +377,7 @@ public class PileBuilder implements RecordPileReceiver { return d_av; } - /** Computes average distance from sequence i to multiple alignment a for the specified offset of i wrt a + /** Computes average distance from sequence i to multiple alignment a for the specified offset of 'i' wrt 'a' * and returns that distance and pair of sequence indices, on which the specified offset is realized * @param a * @param i diff --git a/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java index 76604016c..16079c8e5 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java +++ b/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java @@ -24,20 +24,33 @@ public class SWPairwiseAlignment { private int alignment_offset; // offset of s2 w/respect to s1 private Cigar alignmentCigar; + private double w_match; + private double w_mismatch; + private double w_open; + private double w_extend; + private int best_mm; // mismatch count private static final int IMPOSSIBLE = 1000000000; private static final int MSTATE = 0; private static final int ISTATE = 1; private static final int DSTATE = 2; - public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2 ) { + public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2, double match, double mismatch, double open, double extend ) { s1 = seq1; s2 = seq2; i1 = id1; i2 = id2; + w_match = match; + w_mismatch = mismatch; + w_open = open; + w_extend = extend; best_mm = IMPOSSIBLE; //next_mm = IMPOSSIBLE; - align2(s1,s2); + align4(s1,s2); + } + + public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2) { + this(seq1,seq2,id1,id2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3) } /** Initializes the alignment with pair of sequences (that will be immediately aligned) and @@ -48,6 +61,10 @@ public class SWPairwiseAlignment { this(seq1,seq2,-1,-1); } + public SWPairwiseAlignment(String seq1, String seq2, double match, double mismatch, double open, double extend) { + this(seq1,seq2,-1,-1,match,mismatch,open, extend); + } + public Cigar getCigar() { return alignmentCigar ; } public int getAlignmentStart2wrt1() { return alignment_offset; } @@ -258,6 +275,8 @@ public class SWPairwiseAlignment { } + + /** Allows for separate gap opening and extension penalties, with backtracking. * * @param a @@ -405,6 +424,143 @@ public void align3(String a, String b) { alignment_offset = p.first - p.second; } + public void align4(String a, String b) { + int n = a.length(); + int m = b.length(); + double [][] sw = new double[n+1][m+1]; + + int [][] btrack = new int[n+1][m+1]; + + // build smith-waterman matrix and keep backtrack info: + for ( int i = 1 ; i < n+1 ; i++ ) { + char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos + for ( int j = 1 ; j < m+1 ; j++ ) { + char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos + double step_diag = sw[i-1][j-1] + wd(a_base,b_base); + double step_down = 0.0 ; + int kd = 0; + for ( int k = 1 ; k < i ; k++ ) { + if ( step_down < sw[i-k][j]+wk(a_base,'-',k) ) { + step_down=sw[i-k][j]+wk(a_base,'-',k); + kd = k; + } + } + + double step_right = 0; + int ki = 0; + for ( int k = 1 ; k < j ; k++ ) { + if ( step_right < sw[i][j-k]+wk('-',b_base,k) ) { + step_right=sw[i][j-k]+wk('-',b_base,k); + ki = k; + } + } + + if ( step_down > step_right ) { + if ( step_down > step_diag ) { + sw[i][j] = Math.max(0,step_down); + btrack[i][j] = kd; // positive=vertical + } + else { + sw[i][j] = Math.max(0,step_diag); + btrack[i][j] = 0; // 0 = diagonal + } + } else { + // step_down < step_right + if ( step_right > step_diag ) { + sw[i][j] = Math.max(0,step_right); + btrack[i][j] = -ki; // negative = horizontal + } else { + sw[i][j] = Math.max(0,step_diag); + btrack[i][j] = 0; // 0 = diagonal + } + } + sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); + } + } + +// print(sw,a,b); + + PrimitivePair.Int p = new PrimitivePair.Int(); + double maxscore = 0.0; + // look for largest score. we use >= combined with the traversal direction + // to ensure that if two scores are equal, the one closer to diagonal gets picked + for ( int i = 1 ; i < n+1 ; i++ ) { + if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; } + } + for ( int j = 1 ; j < m+1 ; j++ ) { + if ( sw[n][j] > maxscore || + sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) { + p.first = n; + p.second = j ; + maxscore = sw[n][j]; + } + } + + // System.out.println("\ni="+p.first+"; j="+p.second); + + // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order + + + // we will be placing all insertions and deletions into sequence b, so the state are named w/regard + // to that sequence + + int state = MSTATE; + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + + double [] scores = new double[3]; + + List lce = new ArrayList(5); + + do { + + int btr = btrack[p.first][p.second]; + int step_left = ( btr < 0 ? -btr : 1); + int step_up = ( btr > 0 ? btr : 1 ); + + int new_state = -1; + if ( btr > 0 ) new_state = DSTATE; + else if ( btr < 0 ) new_state = ISTATE; + else new_state = MSTATE; + + int step_length = 1; + + // move to next best location in the sw matrix: + switch( new_state ) { + case MSTATE: p.first--; p.second--; break; + case ISTATE: p.second-=step_left; step_length = step_left; break; + case DSTATE: p.first-=step_up; step_length = step_up; break; + } + + // now let's see if the state actually changed: + if ( new_state == state ) segment_length+=step_length; + else { + // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). + CigarOperator o=null; + switch(state) { + case MSTATE: o = CigarOperator.M; break; + case ISTATE: o = CigarOperator.I; break; + case DSTATE: o = CigarOperator.D; break; + } + CigarElement e = new CigarElement(segment_length,o); + lce.add(e); + segment_length = step_length; + state = new_state; + } + } while ( sw[p.first][p.second] != 0 ); + + // post-process the last segment we are still keeping + CigarOperator o=null; + switch(state) { + case MSTATE: o = CigarOperator.M; break; + case ISTATE: o = CigarOperator.I; break; + case DSTATE: o = CigarOperator.D; break; + } + CigarElement e = new CigarElement(segment_length,o); + lce.add(e); + Collections.reverse(lce); + alignmentCigar = new Cigar(lce); + alignment_offset = p.first - p.second; + } private int w(char x, char y) { if ( x == y ) return 2; // match @@ -413,12 +569,12 @@ public void align3(String a, String b) { } private double wd ( char x, char y ) { - if ( x== y ) return 2.0; - else return -1.0; + if ( x== y ) return w_match; + else return w_mismatch; } private double wk(char x, char y, int k) { - return -2.0-k/6; // gap + return w_open+(k-1)*w_extend; // gap // return -1.0 ; // no extension penalty // return -1.0-Math.log(k+1); // weak extension penalty } @@ -507,6 +663,7 @@ public void align3(String a, String b) { } public String toString() { + StringBuilder bmm = new StringBuilder(); StringBuilder b1 = new StringBuilder(); StringBuilder b2 = new StringBuilder(); @@ -516,12 +673,14 @@ public void align3(String a, String b) { for ( ; pos2 < -alignment_offset ; pos2++ ) { b1.append(' '); b2.append(s2.charAt(pos2)); + bmm.append(' '); } // now pos2 = -alignment_offset; } else { for ( ; pos1 < alignment_offset ; pos1++ ) { b2.append(' '); b1.append(s1.charAt(pos1)); + bmm.append(' '); } // now pos1 = alignment_offset } @@ -530,73 +689,85 @@ public void align3(String a, String b) { CigarElement ce = getCigar().getCigarElement(i) ; switch( ce.getOperator() ) { case M: - int z = ce.getLength(); - b1.append(s1, pos1, pos1+z); - b2.append(s2, pos2, pos2+z); - pos1+=z; pos2+=z; + for ( int k = 0 ; k < ce.getLength() ; k++ ) { + if ( Character.toUpperCase(s1.charAt(pos1)) != Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*'); + else bmm.append(' '); + b1.append(s1.charAt(pos1++)); + b2.append(s2.charAt(pos2++)); + } break; case I: for ( int k = 0 ; k < ce.getLength() ; k++ ) { b1.append('+'); + bmm.append('+'); b2.append(s2.charAt(pos2++)); } break; case D: for ( int k = 0 ; k < ce.getLength() ; k++ ) { b1.append(s1.charAt(pos1++)); + bmm.append('-'); b2.append('-'); } break; } } + bmm.append('\n'); b1.append(s1,pos1,s1.length()); + bmm.append(b1); + bmm.append('\n'); b2.append(s2,pos2,s2.length()); - b1.append('\n'); - b1.append(b2); - b1.append('\n'); - return b1.toString(); + bmm.append(b2); + bmm.append('\n'); + return bmm.toString(); } public static void testMe() { -// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; -// String s2 = "TGTATATAGGGTAAGG"; + /* String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; + String s2 = "TGTATATAGGGTAAGG"; -// String s1 = "GGTAAGGC"; -// String s2 = "GGTCTCAA"; - -// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; -// String s2 = "TGTTAGGGTCTCAAGG"; - -// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; -// String s2 = "TAGGGTAAGGCTGATCCATGTACCG" ; - - String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; - String s2 = "CCGTATCATTACCTGGTGTATATAGG"; - -// String s1 = "GGTGTATATAGGGT" ; -// String s2 = "TGTTAGGG"; testMe(s1,s2); + + s1 = "GGTAAGGC"; + s2 = "GGTCTCAA"; + + testMe(s1,s2); + + s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; + s2 = "TGTTAGGGTCTCAAGG"; + testMe(s1,s2); + + + s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; + s2 = "TAGGGTAAGGCTGATCCATGTACCG" ; + testMe(s1,s2); + + s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; + s2 = "CCGTATCATTACCTGGTGTATATAGG"; + testMe(s1,s2); + + s1 = "GGTGTATATAGGGT" ; + s2 = "TGTTAGGG"; + testMe(s1,s2); + + s1 = "AGACAGAGAGAAGG"; + s2 = "AGACAGAGAAGG"; + testMe(s1,s2); + */ + String s1 = "CCAGCACACAGGTATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA"; + String s2 = "CCAGCACACATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA"; + testMe(s1,s2); } public static void testMe(String s1, String s2) { - SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2); + SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2,3.0,-1.0,-4,-0.5); - - for ( int i = 0 ; i < swpa.getCigar().numCigarElements() ; i++ ) { - char c=' '; - switch ( swpa.getCigar().getCigarElement(i).getOperator() ) { - case M : c = 'M'; break; - case D : c = 'D'; break; - case I : c = 'I'; break; - } - System.out.print(c); - System.out.print(swpa.getCigar().getCigarElement(i).getLength()); - } - SequencePile sp = new SequencePile(s1); - sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1()); - System.out.println(); - System.out.println(sp.format()); + System.out.println(AlignmentUtils.toString(swpa.getCigar())); +// SequencePile sp = new SequencePile(s1); +// sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1()); +// System.out.println(); +// System.out.println(sp.format()); System.out.println("--------\n"+swpa.toString()); diff --git a/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java b/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java index 38a94ce4e..a5984dbe8 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java +++ b/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java @@ -6,13 +6,16 @@ import net.sf.samtools.*; public class SequencePile { private List mSeqGrid; - private StringBuffer mRefGrid; + private StringBuilder mRefGrid; + private StringBuilder headerGrid; private int mDepth; private List mSeqRC; public SequencePile(String ref) { - mRefGrid = new StringBuffer( ref ); + mRefGrid = new StringBuilder( ref ); + headerGrid = new StringBuilder(); + for ( int i = 0; i < ref.length(); i++ ) headerGrid.append(' '); mSeqGrid = new ArrayList(); for ( int i = 0 ; i < mRefGrid.length(); i++ ) { mSeqGrid.add(new MSAColumn()); @@ -39,19 +42,21 @@ public class SequencePile { // alignedSeq = ReverseComplement(seq); // } else alignedSeq = seq; mSeqRC.add(isRC); - - int pos = 0; // actual position on the grid (reference can have insertions) + + // will hold actual position on the grid; reference can have insertions on the grid, + // so position on the grid where we should start placing the read is not refpos! + int pos = 0; for ( int i = 0 ; i < refpos ; i++ ) { // i is the position on the original reference - // if we got some insertions on the reference prior to refpos, we need to take care of them: + // if we got some insertions on the reference prior to refpos, we need to count them in: while( mRefGrid.charAt(pos) == '+' ) { - mSeqGrid.get(pos).add(' '); + mSeqGrid.get(pos).add(' '); // add additional spaces in the line that will hold sequence seq pos++; } mSeqGrid.get(pos).add(' '); // fill with ' ' to the left of the read pos++; } - // we reached start position of the alignment + // we reached start position of the alignment on the reference grid int readpos = 0; // position on the read @@ -66,18 +71,17 @@ public class SequencePile { if ( pos >= 0 ) { if ( mRefGrid.charAt(pos) !='+' ) { // there was no insertion here yet: add it now! mRefGrid.insert(pos, '+'); + headerGrid.insert(pos,'+'); MSAColumn c = new MSAColumn(); + // reads up to the previous depth (prior to adding current read) did not + // have an insertion here, so we insert '*' into all of them: for ( int k = 0 ; k < mDepth ; k++ ) { if ( mSeqGrid.get(pos-1).charAt(k) == ' ') c.add(' '); else c.add('*'); } - mSeqGrid.add(pos, c); - } - try { - mSeqGrid.get(pos).add(alignedSeq.charAt(readpos)); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage()+": "+seq); + mSeqGrid.add(pos, c); // finally, add the base from the current read } + mSeqGrid.get(pos).add(alignedSeq.charAt(readpos)); } readpos++; pos++; @@ -91,6 +95,7 @@ public class SequencePile { } if ( pos >= mRefGrid.length() ) break; mSeqGrid.get(pos).add('-'); // mark deletion + headerGrid.setCharAt(pos,'-'); pos++; } break; @@ -102,11 +107,10 @@ public class SequencePile { pos++; } if ( pos >= mRefGrid.length() ) break; - try { - mSeqGrid.get(pos).add(alignedSeq.charAt(readpos)); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage()+": "+seq); - } + mSeqGrid.get(pos).add(alignedSeq.charAt(readpos)); + if ( Character.toUpperCase(alignedSeq.charAt(readpos)) != + Character.toUpperCase(mRefGrid.charAt(pos)) + && headerGrid.charAt(pos)== ' ') headerGrid.setCharAt(pos,'*'); pos++; readpos++; } @@ -158,35 +162,114 @@ public class SequencePile { public void colorprint() { colorprint(false); } - public void colorprint(boolean printId) { - if ( printId ) System.out.print(" "); - else System.out.print(" "); - System.out.println(mRefGrid); + public void dotprint(boolean printId) { - StringBuilder sb = new StringBuilder(); - java.util.Formatter frmt = new java.util.Formatter(sb); - + String skip = null; + if ( printId ) skip = new String(" "); + else skip = new String(" "); + + System.out.print(formatHeader(skip)); + System.out.print(skip); + System.out.println(mRefGrid); + + try { + for ( int i = 0 ; i < mDepth; i++ ) { + if ( printId ) System.out.printf("%3d",i); + if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-"); + else System.out.print("->"); + for ( int j = 0 ; j < mRefGrid.length() ; j++) { + char seqbase = mSeqGrid.get(j).charAt(i); + char refbase = mRefGrid.charAt(j); + if ( isBase(refbase) && isBase(seqbase) && + Character.toUpperCase(refbase) == + Character.toUpperCase(seqbase) ) { + if ( mSeqRC.get(i) ) System.out.print(','); + else System.out.print('.'); + } + else System.out.print(seqbase); + } + System.out.print('\n'); + } + } catch (Exception e) {} + } + + + public void colorprint(boolean printId) { + + String skip = null; + if ( printId ) skip = new String(" "); + else skip = new String(" "); + + System.out.print(formatHeader(skip)); + System.out.print(skip); + System.out.println(mRefGrid); + try { for ( int i = 0 ; i < mDepth; i++ ) { - if ( printId ) { - sb.delete(0,sb.length()); - frmt.format("%3d:", i); - System.out.print(frmt.out().toString()); - } + if ( printId ) System.out.printf("%3d",i); if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-"); else System.out.print("->"); for ( int j = 0 ; j < mRefGrid.length() ; j++) { char seqbase = mSeqGrid.get(j).charAt(i); char refbase = mRefGrid.charAt(j); - if ( isBase(refbase) && isBase(seqbase) && refbase != seqbase ) System.out.print("\033[31m"+seqbase+"\033[30m"); + if ( isBase(refbase) && isBase(seqbase) && + Character.toUpperCase(refbase) != + Character.toUpperCase(seqbase) ) System.out.print("\033[31m"+seqbase+"\033[30m"); else System.out.print(seqbase); } System.out.print('\n'); } } catch (Exception e) {} } - + + private String formatHeader(String leadString) { + char [][] mm_strings = new char[2][mRefGrid.length()]; + for ( int i = 0 ; i < mRefGrid.length() ; i++ ) { + int count = 0; + char refC = mRefGrid.charAt(i); + MSAColumn col = mSeqGrid.get(i); + if ( refC == '+' ) { + // count number of observations for insertion + for ( int j = 0 ; j < col.size() ; j++ ) { + if ( col.charAt(j) != '*' ) count++; + } + } else { + if ( headerGrid.charAt(i) == '-' ) { + // count number of observations for deletion + for ( int j = 0 ; j < col.size() ; j++ ) { + if ( col.charAt(j) == '-' ) count++; + } + } else { + if ( headerGrid.charAt(i) == '*') { + for ( int j = 0 ; j < col.size() ; j++ ) { + if ( col.charAt(j)!=' ' && + Character.toUpperCase(col.charAt(j)) != + Character.toUpperCase(refC) ) count++; + } + } + } + } + if ( count > 9 ) mm_strings[0][i] = Character.forDigit(count/10,10); + else mm_strings[0][i] = ' '; + if ( count > 0 ) mm_strings[1][i] = Character.forDigit(count%10,10); + else mm_strings[1][i] = ' '; + } + + StringBuilder b = new StringBuilder(); + b.append(leadString); + b.append(mm_strings[0]); + b.append('\n'); + b.append(leadString); + b.append(mm_strings[1]); + b.append('\n'); + b.append(leadString); + b.append(headerGrid); + b.append('\n'); + return b.toString(); + } + private boolean isBase(char b) { + b = Character.toUpperCase(b); return ( b=='A' ||b == 'C' || b=='G' || b=='T' || b=='N'); } }