From 3d1e0bf079cf5ece2fd13cd00cce43d8a97a3bcd Mon Sep 17 00:00:00 2001 From: asivache Date: Tue, 24 Mar 2009 14:06:24 +0000 Subject: [PATCH] git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@166 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/indels/ConsensusSequence.java | 51 +++- .../playground/indels/IndelInspector.java | 2 +- .../playground/indels/MultipleAlignment.java | 238 ++++++++++-------- .../sting/playground/indels/PileBuilder.java | 3 + 4 files changed, 181 insertions(+), 113 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java index 2847972cd..75ed67954 100644 --- a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java +++ b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java @@ -51,7 +51,9 @@ public class ConsensusSequence { // count bases from the sequence into the coverage int posOnConsensus = offset - startOffset; for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { - coverage.get(posOnConsensus)[baseToInt(seq.charAt(i))]++; + char base = Character.toUpperCase(seq.charAt(i)); + if ( base == 'N') continue; + coverage.get(posOnConsensus)[baseToInt(base)]++; } } @@ -73,10 +75,24 @@ public class ConsensusSequence { // subtract sequence bases from the coverage int posOnConsensus = offset - startOffset; for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { - coverage.get(posOnConsensus)[ baseToInt(seq.charAt(i)) ]--; + char base = Character.toUpperCase(seq.charAt(i)); + if ( base == 'N') continue; + coverage.get(posOnConsensus)[ baseToInt(base) ]--; } } + /** Returns offset of the start of consensus sequence with respect to the reference position the + * consensus is pinned to. + * @return + */ + public int getStartOffset() { return startOffset; } + + /** Returns the length (number of bases) of the consensus sequence. + * + * @return + */ + public int length() { return coverage.size(); } + /** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to * the specified sequence seq starting at position offset wrt consenus reference position. * @param seq @@ -97,7 +113,7 @@ public class ConsensusSequence { int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset; for ( ; i < stop ; i++, posOnConsensus++ ) { - int base = baseToInt(seq.charAt(posOnConsensus)); + int base = baseToInt(Character.toUpperCase(seq.charAt(posOnConsensus))); int [] cov = coverage.get(posOnConsensus); int totalcov = cov[0]+cov[1]+cov[2]+cov[3]; @@ -109,7 +125,8 @@ public class ConsensusSequence { * Specified offset must be within the span of currently held consensus sequence. Consensus base is the * one with the maximum count of observations. If two different nucleotides were observed exactly the * same number of times (and that number is greater than the number of observations for othe nucleotides), - * the "lesser" one, (order being ACGT) will be returned. + * the "lesser" one, (order being ACGT) will be returned. If coverage at specified position is zero, 'N' will + * be returned. * @param offset * @return */ @@ -149,6 +166,29 @@ public class ConsensusSequence { return new Pair(base,bmax); } + /** Returns total coverage (all observations regardless of what base what observed) at position + * specified by offset with respect to the conensus' reference position. offset does not have to be within + * the bounds of the currently kept consensus sequence, if it falls outside, a 0 will be silently returned. + * @param offset + * @return + */ + public int coverageAt(int offset) { + if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0; + int [] cov = coverage.get(offset+(int)referencePos); + return cov[0]+cov[1]+cov[2]+cov[3]; + } + + /** Returns consesus sequence as a astring of bases (ACGTN); N will be returned for positions with zero + * coverage. + * @return + */ + public String getSequence() { + char [] b = new char[coverage.size()]; + for ( int i = 0 ; i < b.length ; i++ ) { + b[i] = baseAt(i+startOffset); + } + return new String(b); + } private List instantiateCoverageList(int n) { List< int[] > subseq = new ArrayList(n); @@ -163,7 +203,8 @@ public class ConsensusSequence { case 'C': base = 1; break; case 'G': base = 2; break; case 'T': base = 3; break; - default : throw new IllegalArgumentException("Sequence can contain only ACGT symbols"); + case 'N': base = -1; break; + default : throw new IllegalArgumentException("Sequence can contain only ACGTN symbols"); } return base; } diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java index 893c1bedf..7116b4264 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java @@ -36,7 +36,7 @@ public class IndelInspector extends CommandLineProgram { protected int doWork() { - System.out.println("I am at version 0.2"); + System.out.println("I am at version 0.3"); GenomeLoc location = null; if ( GENOME_LOCATION != null ) { location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION); diff --git a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java index 88d759e9e..6586d6301 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java +++ b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.indels; import java.util.*; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.Pair; public class MultipleAlignment implements Iterable { @@ -12,12 +13,14 @@ public class MultipleAlignment implements Iterable { private List alignment_offsets; // offset of seqs[i] w/respect to seqs[0] (i.e. in order the seqs were added) private int best_mm; // mismatch count private int next_mm; // next-best mismatch count - + private ConsensusSequence consensus; + public MultipleAlignment() { index = new HashMap(); seqs = new ArrayList(); alignment_offsets = new ArrayList(); ext_ids = new ArrayList(); + consensus = new ConsensusSequence(); // we use reference position 0, e.g. we hook onto the first read in the pile } public void clear() { @@ -31,37 +34,46 @@ public class MultipleAlignment implements Iterable { * * @param seq sequence to add * @param i id of the sequence (can be use later to query the pile) + * @see #add(String,int,int) */ public void add( String seq, int i ) throws IllegalStateException { if ( size() != 0 ) throw new IllegalStateException("Single sequence can be added to an empty pile only"); - index.put(i,0); - ext_ids.add(i); - seqs.add(seq); - alignment_offsets.add(0); + add(seq,i,0); } + /** Adds single sequence with id set to i and places it at the specified offset wrt the first sequence + * in this pile (i.e. wrt reference position 0). + * + * @param seq sequence to add + * @param i id of the sequence (can be use later to query the pile) + * @see #add(String,int) + */ + public void add( String seq, int i, int offset ) throws IllegalStateException { + index.put(i,index.size()); + ext_ids.add(i); + seqs.add(seq); + alignment_offsets.add(offset); + consensus.addSequence(seq,offset); + } + public void add( PairwiseAlignment a) { if ( a.id1() == -1 || a.id2() == -1 ) throw new IllegalArgumentException("Attempt to add pairwise alignemnt with sequence ids not properly set"); add(a,a.id1(),a.id2()); } /** Adds pair of aligned sequences to the pile, with the external ids of the first and second sequences being i and j, - * respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty and either both or - * none of the specified ids are already in the pile, an IllegalStateException will be thrown + * respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty, exactly + * one of the sequences held by the pair-wise alignment should be already in the pile; this sequence (and the + * pairwise alignment itself) will be used to stitch the other sequence to the pile. If either both or + * none of the specified ids are already in the pile, an IllegalStateException will be thrown. * @param a * @param i * @param j */ public void add( PairwiseAlignment a, int i, int j ) throws IllegalStateException { if ( seqs.size() == 0 ) { - index.put(i,0); - ext_ids.add(i); - seqs.add(a.getSequence1()); - index.put(j,1); - ext_ids.add(j); - seqs.add(a.getSequence2()); - alignment_offsets.add(0); - alignment_offsets.add(a.getBestOffset2wrt1()); + add(a.getSequence1(),i,0); + add(a.getSequence2(),j,a.getBestOffset2wrt1()); return; } @@ -76,22 +88,49 @@ public class MultipleAlignment implements Iterable { throw new IllegalStateException("Attempt to add pairwise alignment for two sequences none of which is already in the pile"); } - if ( second == null ) { - index.put(j,seqs.size()); - seqs.add(a.getSequence2()); - ext_ids.add(j); - alignment_offsets.add( a.getBestOffset2wrt1() + alignment_offsets.get( first )); - } else { - // first = null - index.put(i,seqs.size()); - seqs.add(a.getSequence1()); - ext_ids.add(i); - alignment_offsets.add( -a.getBestOffset2wrt1() + alignment_offsets.get( second )); - } + if ( second == null ) add(a.getSequence2(),j, a.getBestOffset2wrt1() + alignment_offsets.get( first ) ); + else add(a.getSequence1(),i, -a.getBestOffset2wrt1() + alignment_offsets.get( second ) ); } + /** Adds another pile of aligned sequences to this pile, stitching them together using specified pairwise alignment + * p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in + * the pile being added, otherwise an IllegalArgumentException is thrown. Sequence id's i and j MUST be the ids + * of the first and second sequences in the pairwise alignment, in that order. Specified ids override + * ids, if any, set for the sequences in the pairwise alignment; it is not checked whether the specified and + * stored ids match. The piles can not overlap. + */ + public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) { + int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile + if ( this.contains(i) ) { + if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile"); + off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j); + } else { + if ( this.contains(j)) { + if ( ! a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile"); + off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i); + } else throw new IllegalArgumentException("Sequence is not in the pile"); + } + // stitch sequences from a into this pile: + for ( Integer id : a ) { + if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one"); + add(a.getSequenceById(id),id,off2+a.getOffsetById(id)); + } + } - /** Returns sequence associated with the specified external id, or null if sequence with this external id is not found in the pile + + /** Adds another pile of aligned sequences (a) to this pile, stitching them together using specified + * pairwise alignment p. Sequence ids must be set in the pairwise alignment, and one of those ids + * must be in this pile, and the other in the pile 'a' being added, otherwise an IllegalArgumentException + * is thrown. If pairwise alignment does not have sequence ids set, IllegalArgumentException is thrown. + * The piles can not overlap. + */ + public void add(MultipleAlignment a, PairwiseAlignment p) { + if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set"); + add(a,p,p.id1(),p.id2()); + } + + /** Returns sequence associated with the specified external id, or null if sequence with this external id is + * not found in the pile * * @param id query id * @return sequence for specified id or null @@ -101,49 +140,21 @@ public class MultipleAlignment implements Iterable { return seqs.get(index.get(id)); } - /** Returns offset relative to the first sequence in the pile for sequence associated with the specified external id + /** Returns offset relative to the first sequence in the pile for sequence associated with the specified + * external id. If sequence with specified id is not found in the pile, RuntimeException is thrown. * * @param id query id * @return offset for sequence with specified id */ public int getOffsetById(int id) { - //TODO: do something meaningful when id is not in the pile (exception?) - if ( ! contains(id)) return 0; + if ( ! contains(id) ) throw new RuntimeException("Specified id is not in the pile"); return alignment_offsets.get(index.get(id)); } - /** Adds pile of aligned sequences to this pile, stitching them together using specified pairwise alignment - * p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in the pile being added, - * otherwise an IllegalArgumentException is thrown. The piles can not overlap. - */ - public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) { - int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile - if ( this.contains(i) ) { - if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile"); - off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j); - } else { - if ( this.contains(j)) { - if ( ! a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile"); - off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i); - } else throw new IllegalArgumentException("Sequence is not in the pile"); - } - // stitch sequences from a into this pile: - for ( Integer id : a ) { - if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one"); - index.put(id,seqs.size()); - ext_ids.add(id); - seqs.add(a.getSequenceById(id)); - alignment_offsets.add(off2+a.getOffsetById(id)); - } - } - public void add(MultipleAlignment a, PairwiseAlignment p) { - if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set"); - add(a,p,p.id1(),p.id2()); - } - /** Returns true if the alignment already contains sequence with the specified id + /** Returns true if the alignment already contains sequence with the specified id. * * @param id * @return @@ -151,7 +162,15 @@ public class MultipleAlignment implements Iterable { public boolean contains(int id) { return index.containsKey(id); } - + + /** Returns number of mismatches between sequences i and j (external ids) in the currently held multiple alignment. + * Will return 0 if sequences do not overlap. Will throw RuntimeException if any of the specified ids is not + * found in the current pile. + * @param i id of the first sequence + * @param j id of the second sequence + * @return mismatch count + * + * */ public int countMismatches(int i, int j) { return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i)); } @@ -216,58 +235,63 @@ public class MultipleAlignment implements Iterable { if ( seqs.size() == 0 ) return b.toString(); - int skip_first = 0; - int msa_length = 0; - for ( int i = 0 ; i < seqs.size() ; i++ ) { - if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i); - msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length ); - } +// int first_offset = 0; // offset of the first sequence wrt the start of the MSA + final int first_offset = -consensus.getStartOffset(); +// int msa_length = 0; +// for ( int i = 0 ; i < seqs.size() ; i++ ) { +// if ( -alignment_offsets.get(i) > first_offset ) first_offset = -alignment_offsets.get(i); +// msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length ); +// } - msa_length += skip_first; - int [] cov = new int[4]; - char[] bases = { 'A' , 'C', 'G', 'T' }; - char[][] consensus = new char[4][msa_length]; +// msa_length += first_offset; + final int msa_length = consensus.length(); +// int [] cov = new int[4]; +// char[] bases = { 'A' , 'C', 'G', 'T' }; + char[][] consensusString = new char[4][msa_length]; - for ( int i = 0 ; i < msa_length ; i++ ) { - cov[0] = cov[1] = cov[2] = cov[3] = 0; - for ( int j = 0 ; j < seqs.size(); j++ ) { - // offset of the sequence j wrt start of the msa region - int seq_offset = skip_first + alignment_offsets.get(j); - if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i - int base = -1; - switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) { - case 'A': base = 0; break; - case 'C': base = 1 ; break; - case 'G': base = 2 ; break; - case 'T': base = 3 ; break; - } - if ( base >= 0 ) cov[base]++; - } - int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; - int bmax = 0; - int mm = 0; - consensus[3][i] = 'N'; - for ( int z = 0; z < 4 ; z++ ) { - if ( cov[z] > bmax ) { - bmax = cov[z]; - consensus[3][i] = bases[z]; - mm = total_cov - bmax; - } - } + for ( int i = -first_offset ; i < msa_length - first_offset ; i++ ) { +// cov[0] = cov[1] = cov[2] = cov[3] = 0; +// for ( int j = 0 ; j < seqs.size(); j++ ) { +// // offset of the sequence j wrt start of the msa region +// int seq_offset = first_offset + alignment_offsets.get(j); +// if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i +// int base = -1; +// switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) { +// case 'A': base = 0; break; +// case 'C': base = 1 ; break; +// case 'G': base = 2 ; break; +// case 'T': base = 3 ; break; +// } +// if ( base >= 0 ) cov[base]++; +// } +// int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; +// int bmax = 0; +// int mm = 0; + + Pair base = consensus.baseWithCountAt(i-first_offset); + consensusString[3][i] = base.first; +// for ( int z = 0; z < 4 ; z++ ) { +// if ( cov[z] > bmax ) { +// bmax = cov[z]; +// consensus[3][i] = bases[z]; +// mm = total_cov - bmax; +// } +// } + int mm = consensus.coverageAt(i) - base.second; if ( mm > 0 ) { - consensus[2][i] = '*'; - if ( mm > 9 ) consensus[0][i] = Character.forDigit(mm/10,10); - else consensus[0][i] = ' '; - consensus[1][i] = Character.forDigit(mm%10,10); + consensusString[2][i] = '*'; + if ( mm > 9 ) consensusString[0][i] = Character.forDigit(mm/10,10); + else consensusString[0][i] = ' '; + consensusString[1][i] = Character.forDigit(mm%10,10); } else { - consensus[0][i] = consensus[1][i] = consensus[2][i] = ' '; + consensusString[0][i] = consensusString[1][i] = consensusString[2][i] = ' '; } } - b.append(" "); b.append(consensus[0]); b.append('\n'); - b.append(" "); b.append(consensus[1]); b.append('\n'); - b.append(" "); b.append(consensus[2]); b.append('\n'); - b.append(" "); b.append(consensus[3]); b.append('\n'); + b.append(" "); b.append(consensusString[0]); b.append('\n'); + b.append(" "); b.append(consensusString[1]); b.append('\n'); + b.append(" "); b.append(consensusString[2]); b.append('\n'); + b.append(" "); b.append(consensusString[3]); b.append('\n'); Integer[] perm = null; if ( inorder ) perm = Utils.SortPermutation(alignment_offsets); @@ -275,7 +299,7 @@ public class MultipleAlignment implements Iterable { for ( int i = 0 ; i < seqs.size() ; i++ ) { int index = (inorder ? perm[i] : i); frmt.format("%3d:", ext_ids.get(index)); - skipN(alignment_offsets.get(index)+skip_first,b); + skipN(alignment_offsets.get(index)+ first_offset,b); b.append(seqs.get(index)); b.append('\n'); } diff --git a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java index ceb6bf8d6..4b4712a37 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java +++ b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java @@ -103,6 +103,9 @@ public class PileBuilder implements RecordPileReceiver { System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ; List piles = doMultipleAlignment2(seqs); + + + // System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); // System.out.print("; diameter of PILE1: "+ diameter(alignments1)); // System.out.println("; diameter of PILE2: "+ diameter(alignments2));