From f47a214f96330f59262d260e8344085a05c13be9 Mon Sep 17 00:00:00 2001 From: asivache Date: Thu, 26 Mar 2009 02:26:17 +0000 Subject: [PATCH] massive changes everywhere; lots of bugs fixed; methods moved around; computation and printout of overall stats added; now decides whether to accept or reject 'improvement'; writes alignments into two output sam files (unmodified reads/failed piles into one, realigned piles into the other); special treat for paranoids: writes third sam file with all the analyzed reads, unmodified git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@197 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/indels/AlignmentUtils.java | 9 +- .../sting/playground/indels/Indel.java | 6 +- .../playground/indels/IndelInspector.java | 33 +- .../indels/IndelRecordPileCollector.java | 25 +- .../playground/indels/MultipleAlignment.java | 16 +- .../playground/indels/PassThroughWriter.java | 34 ++ .../sting/playground/indels/PileBuilder.java | 375 ++++++++++++++---- .../indels/SWPairwiseAlignment.java | 48 ++- .../sting/playground/indels/SequencePile.java | 2 +- 9 files changed, 429 insertions(+), 119 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/playground/indels/PassThroughWriter.java diff --git a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java index 5a72eec3c..fd0d41230 100644 --- a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java @@ -113,8 +113,14 @@ public class AlignmentUtils { Indel curr_indel = null; switch(ce.getOperator()) { - case I: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I); break; + case I: + curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I); + if ( i == 0 ) System.out.println("WARNING: Indel at start!"); + if ( i == c.numCigarElements() - 1) System.out.println("WARNING: Indel at end!"); + break; case D: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.D); + if ( i == 0 ) System.out.println("WARNING: Indel at start!"); + if ( i == c.numCigarElements() - 1) System.out.println("WARNING: Indel at end!"); runninglength += ce.getLength(); break; case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment @@ -152,6 +158,7 @@ public class AlignmentUtils { for ( Indel ind : indels ) { CountedObject ci = new CountedObject(ind); CountedObject found = t.floor(ci); +// CountedObject found2 = t.ceiling(ci); if ( ci.equals( found ) ) found.increment(); // we did find our indel, advance the counter else t.add(ci); // this is a new indel. Add it. diff --git a/java/src/org/broadinstitute/sting/playground/indels/Indel.java b/java/src/org/broadinstitute/sting/playground/indels/Indel.java index b5ec9b711..cd79b77a5 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/Indel.java +++ b/java/src/org/broadinstitute/sting/playground/indels/Indel.java @@ -114,7 +114,7 @@ public class Indel implements Interval { * @param i Another interval * @return true iff intervals overlap */ - @Override + public boolean overlapsP(Interval i) { return ! disjointP(i); //To change body of implemented methods use File | Settings | File Templates. } @@ -126,7 +126,6 @@ public class Indel implements Interval { * @param i Another interval * @return true iff intervals do not overlap */ - @Override public boolean disjointP(Interval i) { return i.getStop() < this.getStart() || i.getStart() > this.getStop(); } @@ -135,7 +134,6 @@ public class Indel implements Interval { * has length of 0. * @return length of the event on the original, unmodified reference */ - @Override public long getLength() { if ( mType == IndelType.I ) return 0; return mLength; @@ -150,6 +148,6 @@ public class Indel implements Interval { @Override public int hashCode() { - return (int)( mStart << 2 + mLength ); + return (int)( mStart << 6 + mStart + mLength ); } } diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java index 16d466213..ebee842f7 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java @@ -11,8 +11,6 @@ import javax.swing.filechooser.FileNameExtensionFilter; import edu.mit.broad.picard.cmdline.CommandLineProgram; import edu.mit.broad.picard.cmdline.Option; import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.reference.ReferenceSequenceFile; -import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker; import edu.mit.broad.picard.reference.ReferenceSequence; @@ -25,6 +23,10 @@ public class IndelInspector extends CommandLineProgram { @Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n"; @Option(shortName="I", doc="SAM or BAM file for calling",optional=true) public File INPUT_FILE; @Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION; + @Option(shortName="V",doc="Verbosity level: SILENT, PILESUMMARY, ALIGNMENTS", optional=true) public String VERBOSITY_LEVEL; + @Option(doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1; + @Option(doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2; + @Option(doc="[paranoid] Output \"control\" file (sam or bam): all reads picked and processed by this tool will be also saved, unmodified, into this file", optional=true) public String OUTC; @Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE; @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS; // @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE; @@ -46,7 +48,7 @@ public class IndelInspector extends CommandLineProgram { System.out.println("Unknown value specified for ERR_MODE"); return 1; } - + final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/")); samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); @@ -60,12 +62,26 @@ public class IndelInspector extends CommandLineProgram { ReferenceSequence contig_seq = null; IndelRecordPileCollector col = null; - PileBuilder pileBuilder = new PileBuilder(); + PassThroughWriter ptWriter = new PassThroughWriter(OUT1,samReader.getFileHeader()); + PileBuilder pileBuilder = new PileBuilder(OUT2,samReader.getFileHeader(),ptWriter); + + SAMFileWriter controlWriter = null; + if ( OUTC != null ) controlWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(samReader.getFileHeader(),false,new File(OUTC)); + try { - col = new IndelRecordPileCollector(new DiscardingReceiver(), pileBuilder ); + col = new IndelRecordPileCollector(ptWriter, pileBuilder, controlWriter ); } catch(Exception e) { System.err.println(e.getMessage()); } if ( col == null ) return 1; - + + if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT"); + if ( VERBOSITY_LEVEL.toUpperCase().equals("SILENT")) pileBuilder.setVerbosity(pileBuilder.SILENT); + else if ( VERBOSITY_LEVEL.toUpperCase().equals("PILESUMMARY") ) pileBuilder.setVerbosity(pileBuilder.PILESUMMARY); + else if ( VERBOSITY_LEVEL.toUpperCase().equals("ALIGNMENTS") ) pileBuilder.setVerbosity(pileBuilder.ALIGNMENTS); + else { + System.out.println("Unrecognized VERBOSITY_LEVEL setting."); + return 1; + } + String cur_contig = null; int counter = 0; @@ -120,9 +136,14 @@ public class IndelInspector extends CommandLineProgram { col.receive(r); } + + pileBuilder.printStats(); System.out.println("done."); col.printLengthHistograms(); samReader.close(); + pileBuilder.close(); + ptWriter.close(); + if ( controlWriter != null ) controlWriter.close(); return 0; } diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java index 37f0965a5..eb01d5d08 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java +++ b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java @@ -89,6 +89,8 @@ public class IndelRecordPileCollector implements RecordReceiver { private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there + private SAMFileWriter controlWriter; + private String referenceSequence; public String memStatsString() { @@ -97,7 +99,12 @@ public class IndelRecordPileCollector implements RecordReceiver { //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop; } - public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException { + public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException { + this(rr,rp,null); + } + + + public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp, SAMFileWriter cw) throws java.io.IOException { mRecordPile = new LinkedList(); mAllIndels = new TreeSet >( new CountedObjectComparatorAdapter(new IntervalComparator())); @@ -113,6 +120,7 @@ public class IndelRecordPileCollector implements RecordReceiver { defaultReceiver = rr; indelPileReceiver = rp; referenceSequence = null; + controlWriter = cw; setWaitState(); } @@ -146,6 +154,7 @@ public class IndelRecordPileCollector implements RecordReceiver { SAMRecord r = i.next(); if ( r.getAlignmentEnd() <= pos ) { defaultReceiver.receive(r); + if ( controlWriter != null ) controlWriter.addAlignment(r); i.remove(); } else break; } @@ -161,6 +170,7 @@ public class IndelRecordPileCollector implements RecordReceiver { SAMRecord r = i.next(); if ( r.getAlignmentStart() >= pos ) { defaultReceiver.receive(r); + if ( controlWriter != null ) controlWriter.addAlignment(r); i.remove(); } else break; } @@ -250,7 +260,10 @@ public class IndelRecordPileCollector implements RecordReceiver { } // no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile - for ( SAMRecord r : mRecordPile ) defaultReceiver.receive(r); + for ( SAMRecord r : mRecordPile ) { + defaultReceiver.receive(r); + if ( controlWriter != null ) controlWriter.addAlignment(r); + } setWaitState(); return; } @@ -325,7 +338,13 @@ public class IndelRecordPileCollector implements RecordReceiver { System.out.print(finalPile.size() + " reads in the pile;") ; System.out.println(formatRange(finalTrain)); indelPileReceiver.receive(finalPile); - } else for ( SAMRecord r : finalPile ) defaultReceiver.receive(r); + if ( controlWriter != null ) for ( SAMRecord r : finalPile ) controlWriter.addAlignment(r); + } else { + for ( SAMRecord r : finalPile ) { + defaultReceiver.receive(r); + controlWriter.addAlignment(r); + } + } finalPile.clear(); finalTrain.clear(); curr_stop = -1; diff --git a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java index 94dbd6c33..fc9d635d7 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java +++ b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java @@ -240,7 +240,7 @@ public class MultipleAlignment implements Iterable { /** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately * shifted and ready for printout; */ - public String toString(boolean inorder) { + public String toString(boolean inorder, boolean dotprint) { StringBuilder b = new StringBuilder(); java.util.Formatter frmt = new java.util.Formatter(b); @@ -278,8 +278,16 @@ public class MultipleAlignment implements Iterable { for ( int i = 0 ; i < seqs.size() ; i++ ) { int index = (inorder ? perm[i] : i); frmt.format("%3d:", ext_ids.get(index)); - skipN(alignment_offsets.get(index)+ first_offset,b); - b.append(seqs.get(index)); + int pos = alignment_offsets.get(index)+ first_offset; // start position on the consensus sequence + skipN(pos,b); + String aSeq = seqs.get(index); + if ( dotprint ) { + for ( int j = 0 ; j < aSeq.length() ; j++, pos++ ) { + if ( Character.toUpperCase(aSeq.charAt(j)) == + Character.toUpperCase(consensusString[3][pos]) ) b.append('.'); + else b.append(aSeq.charAt(j)); + } + } else b.append(aSeq); b.append('\n'); } // b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n"); @@ -290,7 +298,7 @@ public class MultipleAlignment implements Iterable { return consensus.getSequence(); } - public String toString() { return toString(true); } + public String toString() { return toString(true, false); } public int size() { return seqs.size(); } diff --git a/java/src/org/broadinstitute/sting/playground/indels/PassThroughWriter.java b/java/src/org/broadinstitute/sting/playground/indels/PassThroughWriter.java new file mode 100644 index 000000000..26b651bce --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/indels/PassThroughWriter.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.playground.indels; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMFileWriterFactory; +import net.sf.samtools.SAMFileHeader; + +import java.io.File; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Mar 25, 2009 + * Time: 8:27:09 PM + * To change this template use File | Settings | File Templates. + */ +public class PassThroughWriter implements RecordReceiver { + private SAMFileWriter writer; + + public PassThroughWriter( File f, SAMFileHeader h) { + writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(h, false, f); + } + + public PassThroughWriter(String s, SAMFileHeader h) { + this(new File(s), h); + } + + public void receive(SAMRecord r) { + //To change body of implemented methods use File | Settings | File Templates. + writer.addAlignment(r); + } + + public void close() { writer.close() ; } +} diff --git a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java index 56222e238..72926831d 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java +++ b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.indels; import net.sf.samtools.*; import java.util.*; +import java.io.File; + import org.broadinstitute.sting.utils.PrimitivePair; import org.broadinstitute.sting.playground.utils.CountedObject; import org.broadinstitute.sting.playground.utils.CountedObjectComparatorAdapter; @@ -17,6 +19,30 @@ public class PileBuilder implements RecordPileReceiver { private String referenceSequence; private int reference_start; + private int processed_piles = 0; + private int improved_piles = 0; + private int unmodified_piles = 0; + private int failed_piles = 0; + private int indels_improved = 0; + private int indel_improvement_cnt = 0; + private int indels_discarded = 0; + private int indels_added = 0; + private int indels_added_cnt = 0; + private int total_mismatches_count_in_improved = 0; + private int total_mismatches_count_in_failed = 0; + private int total_improved_mismatches_count = 0; + private int total_reads_in_improved = 0; + private int total_reads_in_failed = 0; + private int total_alignments_modified = 0; + + public final static int SILENT = 0; + public final static int PILESUMMARY = 1; + public final static int ALIGNMENTS = 2; + + private int mVerbosityLevel = SILENT; + + private SAMFileWriter samWriter; + private RecordReceiver failedPileReceiver; private static class SelectedPair { private int i_; @@ -68,9 +94,15 @@ public class PileBuilder implements RecordPileReceiver { } - public PileBuilder() { + public PileBuilder(File f, SAMFileHeader h, RecordReceiver fr) { + samWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(h,false,f); referenceSequence = null; reference_start = -1; + failedPileReceiver = fr; + } + + public PileBuilder(String s, SAMFileHeader h, RecordReceiver fr) { + this(new File(s),h, fr); } public void setReferenceSequence(String seq, int start) { @@ -84,6 +116,10 @@ public class PileBuilder implements RecordPileReceiver { } public void receive(Collection c) { + + //TODO: if read starts/ends with an indel (insertion, actually), we detect this as a "different" indel introduced during cleanup. + processed_piles++; + IndexedSequence[] seqs = new IndexedSequence[c.size()]; int i = 0; int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref @@ -101,18 +137,24 @@ public class PileBuilder implements RecordPileReceiver { TreeSet< CountedObject > all_indels = new TreeSet< CountedObject >( new CountedObjectComparatorAdapter(new IntervalComparator())); - SequencePile originalAligns = new SequencePile(pileRef); + SequencePile originalAligns = null; + if ( mVerbosityLevel >= ALIGNMENTS ) originalAligns = new SequencePile(pileRef); + for ( SAMRecord r : c ) { - originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), - r.getCigar(), r.getAlignmentStart() - startOnRef ); + if ( mVerbosityLevel >= ALIGNMENTS ) { + originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), + r.getCigar(), r.getAlignmentStart() - startOnRef ); + } totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence); AlignmentUtils.collectAndCountIndels(r,all_indels); } - System.out.println("\n#############################################################################"); - System.out.println("ORIGINAL ALIGNMENT: \n"); - originalAligns.dotprint(true); - System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ; + if ( mVerbosityLevel >= ALIGNMENTS ) { + System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"); + System.out.println("ORIGINAL ALIGNMENT: \n"); + originalAligns.dotprint(true); + System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ; + } List piles = doMultipleAlignment2(seqs); @@ -120,91 +162,244 @@ public class PileBuilder implements RecordPileReceiver { // System.out.print("; diameter of PILE1: "+ diameter(alignments1)); // System.out.println("; diameter of PILE2: "+ diameter(alignments2)); - SymmetricMatrix d = new SymmetricMatrix(piles.size()); - for ( int n = 0 ; n < piles.size() ; n++ ) { - d.set(n,n,diameter(piles.get(n))); - for ( int m = n+1 ; m < piles.size() ; m++ ) { - d.set(n,m,distance(piles.get(n), piles.get(m))); + SymmetricMatrix d = new SymmetricMatrix(piles.size()); + for ( int n = 0 ; n < piles.size() ; n++ ) { + d.set(n,n,diameter(piles.get(n))); + for ( int m = n+1 ; m < piles.size() ; m++ ) { + d.set(n,m,distance(piles.get(n), piles.get(m))); + } } - } - int new_mismatches = 0 ; // number of mismatches after re-alignment: - TreeSet< CountedObject > new_indels = new TreeSet< CountedObject >( - new CountedObjectComparatorAdapter(new IntervalComparator()) - ); // new indels after realignment - int shifted_reads = 0; - int smashed_reads = 0; + int new_mismatches = 0 ; // number of mismatches after re-alignment: + TreeSet< CountedObject > new_indels = new TreeSet< CountedObject >( + new CountedObjectComparatorAdapter(new IntervalComparator()) + ); // new indels after realignment + int shifted_reads = 0; + int smashed_reads = 0; - List as_list = (List)c; // ugly hack; need this to access records by ids + List as_list = (List)c; // ugly hack; need this to access records by ids - System.out.println(d.format("%8.4g")); - for ( int n = 0 ; n < piles.size() ; n++ ) { + if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(d.format("%8.4g")); + + for ( int n = 0 ; n < piles.size() ; n++ ) { // SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0); + SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5); + + if ( mVerbosityLevel >= ALIGNMENTS ) { + + System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):"); + System.out.println(consToRef.toString()); + System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString(true,true)); + } +// SequencePile pileAligns = new SequencePile(pileRef); + + MultipleAlignment ma = piles.get(n); + for ( Integer id : ma ) { + SAMRecord r = as_list.get(id); + int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq + +/* + System.out.println("id=" + id +": offset on consensus="+cons_offset+ + "; consensus wrt ref chunk="+consToRef.getAlignmentStart2wrt1()+"; chunk start="+startOnRef); +*/ + + int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar()); + if ( ref_offset != r.getAlignmentStart()) shifted_reads++; + Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar()); +/* + if ( id == 9 ) { + System.out.println("ref_offset="+ref_offset+"; orig_ref_off="+r.getAlignmentStart()+"; "+ + AlignmentUtils.toString(cig)); + } + + System.out.println("adding "+id+" at "+ (ref_offset - refStarttemp)); + pileAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), cig, ref_offset - refStarttemp); +*/ + if ( cig.numCigarElements() != r.getCigar().numCigarElements() ) smashed_reads++; + + if ( ref_offset != r.getAlignmentStart() || cig.numCigarElements() != r.getCigar().numCigarElements() ) total_alignments_modified++; + + SAMRecord rtest = new SAMRecord(r.getHeader()); + rtest.setAlignmentStart(ref_offset); + rtest.setReadString(r.getReadString()); + rtest.setReadUmappedFlag(r.getReadUnmappedFlag()); + rtest.setCigar(cig); + AlignmentUtils.collectAndCountIndels(rtest,new_indels); + new_mismatches += AlignmentUtils.numMismatches(rtest,referenceSequence); + } + // pileAligns.colorprint(true); + } + + boolean pile_improved = false; + boolean pile_unmodified = false; + boolean pile_failed = false; + + double mmChangePct = Math.abs((new_mismatches - totalMismatches)*100.0/totalMismatches); + + if ( shifted_reads == 0 && smashed_reads == 0 ) pile_unmodified = true; + else { + if ( new_mismatches < totalMismatches || + mmChangePct < 10.0 && ( new_indels.size() < all_indels.size() ) + ) pile_improved = true; + else pile_failed = true; + } + + if ( pile_improved ) { + total_mismatches_count_in_improved +=totalMismatches; + total_improved_mismatches_count += new_mismatches; + total_reads_in_improved += c.size() ; + } + + if ( pile_failed ) { + total_mismatches_count_in_failed += totalMismatches; + total_reads_in_failed += c.size(); + } + int discovered_indels = 0; + int discovered_support = 0; + int existing_indels = 0; + int existing_support = 0; + int existing_support_new = 0; + int discarded_indels = 0; + for ( CountedObject ind : new_indels ) { + //System.out.print("new indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop()); + if ( ! all_indels.contains(ind) ) { + //System.out.println(" (DISCOVERED)"); + discovered_indels++; + discovered_support += ind.getCount(); + if ( pile_improved ) { + indels_added++; + indels_added_cnt += ind.getCount(); + } + } else { + //System.out.println(" (EXISTING)"); + existing_indels++; + existing_support_new += ind.getCount(); + if ( pile_improved && ( ind.getCount() > all_indels.floor(ind).getCount() ) ) { + if ( ! ind.equals(all_indels.floor(ind))) System.out.println("ERROR MATCHING INDELS!!!") ; + indels_improved++; + indel_improvement_cnt += ( ind.getCount() - all_indels.floor(ind).getCount() ); + } + } + } + for ( CountedObject ind : all_indels ) { + //System.out.print("old indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop()); + if ( ! new_indels.contains(ind )) { + //System.out.println(" (DISCARDED)"); + discarded_indels++; + if ( pile_improved ) indels_discarded++; + } else { + //System.out.println(" (KEPT)"); + existing_support += ind.getCount(); + } + } + + if ( pile_improved ) improved_piles++; + if ( pile_unmodified ) unmodified_piles++; + if ( pile_failed ) failed_piles++; + + if ( mVerbosityLevel >= PILESUMMARY ) { + System.out.print("TOTAL MISMATCHES: "+totalMismatches +" --> "+new_mismatches); + if ( totalMismatches > new_mismatches ) System.out.print("(-"); + else System.out.print("(+"); + System.out.printf("%.2f%%)%n",mmChangePct); + + System.out.println("CONFIRMED INDELS: "+existing_indels); + System.out.print("CONFIRMED INDEL SUPPORT: "+existing_support + " --> " + existing_support_new ); + if ( existing_support > existing_support_new ) System.out.print("(-"); + else System.out.print("(+"); + System.out.printf("%.2f%%)%n",Math.abs((existing_support- existing_support_new)*100.0/existing_support)); + System.out.println("DROPPED INDELS: " + discarded_indels); + System.out.println("DISCOVERED INDELS: " + discovered_indels) ; + System.out.println("DISCOVERED INDELS SUPPORT: "+discovered_support); + System.out.println("ALIGNMENTS SHIFTED: "+shifted_reads); + System.out.println("ALIGNMENTS WITH GAPS CHANGED: "+smashed_reads); + + if ( pile_improved ) System.out.println("OUTCOME: IMPROVED"); + if ( pile_unmodified ) System.out.println("OUTCOME: UNCHANGED"); + if ( pile_failed ) System.out.println("OUTCOME: FAILED"); + + System.out.println("\n#############################################################################\n"); + } + // finally, writing stuff: + for ( int n = 0 ; n < piles.size() ; n++ ) { + SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5); - - System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):"); - System.out.println(consToRef.toString()); - System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString()); - MultipleAlignment ma = piles.get(n); - for ( Integer id : ma ) { + + Iterator id_iter = ma.sequenceIdByOffsetIterator(); + while ( id_iter.hasNext() ) { + + int id = id_iter.next(); + SAMRecord r = as_list.get(id); + if ( pile_failed || pile_unmodified ) { + failedPileReceiver.receive(r); // nothing to do, send failed piles directly for writing + continue; + } + // we improved stuff!! let's reset the alignment parameters! + int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq - int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1(); - if ( ref_offset != r.getAlignmentStart()) shifted_reads++; + + // offset of the realigned read r on the reference + int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar()); + + r.setAlignmentStart(ref_offset); + Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar()); - if ( cig.numCigarElements() != r.getCigar().numCigarElements() ) smashed_reads++; - SAMRecord rtest = new SAMRecord(r.getHeader()); - rtest.setAlignmentStart(ref_offset); - rtest.setReadString(r.getReadString()); - rtest.setReadUmappedFlag(r.getReadUnmappedFlag()); - rtest.setCigar(cig); - AlignmentUtils.collectAndCountIndels(rtest,new_indels); - new_mismatches += AlignmentUtils.numMismatches(rtest,referenceSequence); - } - } + r.setCigar(cig); + + r.setAttribute("NM",new Integer(AlignmentUtils.numMismatches(r,referenceSequence))); + + if ( r.getAlignmentStart() == 713655 ) { + System.out.println("!!!----> "+r.format()); + System.out.println("!!!----> "+AlignmentUtils.toString(cig) +" --- " +AlignmentUtils.toString(r.getCigar())); + } + // System.out.println("writing " + id); + samWriter.addAlignment(r); - int discovered_indels = 0; - int discovered_support = 0; - int existing_indels = 0; - int existing_support = 0; - int existing_support_new = 0; - int discarded_indels = 0; - for ( CountedObject ind : new_indels ) { - if ( ! all_indels.contains(ind) ) { - discovered_indels++; - discovered_support += ind.getCount(); - } else { - existing_indels++; - existing_support_new += ind.getCount(); - } - } - for ( CountedObject ind : all_indels ) { - if ( ! new_indels.contains(ind )) { - discarded_indels++; - } else { - existing_support += ind.getCount(); } } - System.out.print("TOTAL MISMATCHES: "+totalMismatches +" --> "+new_mismatches); - if ( totalMismatches > new_mismatches ) System.out.print("(-"); - else System.out.print("(+"); - System.out.println(Math.abs((new_mismatches - totalMismatches)*100.0/totalMismatches)+"%)"); - - System.out.println("CONFIRMED INDELS: "+existing_indels); - System.out.print("CONFIRMED INDEL SUPPORT: "+existing_support + " --> " + existing_support_new ); - if ( existing_support > existing_support_new ) System.out.print("(-"); - else System.out.print("(+"); - System.out.println(Math.abs((existing_support- existing_support_new)*100.0/existing_support)+"%)"); - System.out.println("DROPPED INDELS: " + discarded_indels); - System.out.println("DISCOVERED INDELS: " + discovered_indels) ; - System.out.println("DISCOVERED INDELS SUPPORT: "+discovered_support); - System.out.println("ALIGNMENTS SHIFTED: "+shifted_reads); - System.out.println("ALIGNMENTS WITH GAPS CHANGED: "+smashed_reads); } + public void close() { samWriter.close(); } + + public double pct (int i, int t) { + return ((double)i*100.0/((double)t)); + } + + public void printStats() { + System.out.println("\n---------------------------------------------------------------------------------"); + System.out.println("Piles processed: "+ processed_piles); + System.out.printf("Piles improved: %d (%.2f%%)%n", improved_piles,pct(improved_piles,processed_piles)); + System.out.printf("Piles confirmed (unchanged): %d (%.2f%%)%n", unmodified_piles,pct(unmodified_piles,processed_piles)); + System.out.printf("Piles failed: %d (%.2f%%)%n", failed_piles,pct(failed_piles,processed_piles)); + System.out.println("In improved piles:"); + System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_improved, + (double)total_reads_in_improved/(double)improved_piles,(double) total_mismatches_count_in_improved /(double)total_reads_in_improved); + System.out.printf(" Overall mismatch count: %d --> %d (%.2f%%)%n", total_mismatches_count_in_improved,total_improved_mismatches_count, + pct(total_improved_mismatches_count- total_mismatches_count_in_improved, total_mismatches_count_in_improved)); + System.out.printf(" Mismatch improvement: suppressed %.2f mm/read%n", + (double)(total_mismatches_count_in_improved -total_improved_mismatches_count)/(double)total_reads_in_improved ); + System.out.printf(" Alignments modified: %d (%.2f%% of total or %.2f per pile)%n",total_alignments_modified, + pct(total_alignments_modified,total_reads_in_improved),(double)total_alignments_modified/(double)improved_piles); + System.out.printf(" Improved indels: %d (%.2f per pile) with %.3f additional reads per indel%n", + indels_improved,(double)indels_improved/(double)improved_piles,(double)indel_improvement_cnt/(double)indels_improved); + System.out.printf(" New indels: %d (%.2f per pile) with %.3f reads per indel%n", + indels_added,(double)indels_added/(double)improved_piles,(double)indels_added_cnt/(double)indels_added); + System.out.printf(" Discarded indels: %d (%.2f per pile)%n", + indels_discarded,(double)indels_discarded/(double)improved_piles); + System.out.println("In failed piles:"); + System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_failed, + (double)total_reads_in_failed/(double)failed_piles,(double) total_mismatches_count_in_failed /(double)total_reads_in_failed); + System.out.println("---------------------------------------------------------------------------------\n"); + + } + + public void setVerbosity(int v) { + mVerbosityLevel = v; + } /** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X, * and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct * alignment of the read to Y (i.e. if the alignment of X to Y contains indel(s) and the read spans them, the @@ -224,7 +419,7 @@ public class PileBuilder implements RecordPileReceiver { int i = 0; while ( refpos <= s ) { celem = baseCigar.getCigarElement(i); - refpos+=celem.getLength(); + if ( celem.getOperator() != CigarOperator.D ) refpos+=celem.getLength(); i++; } // we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element @@ -241,6 +436,24 @@ public class PileBuilder implements RecordPileReceiver { return new Cigar(lce); } + private int indelCorrection(int offset, Cigar cig) { + int correction = 0; + for ( int i = 0 ; i < cig.numCigarElements() && offset > 0 ; i++ ) { + CigarElement ce = cig.getCigarElement(i); + switch ( ce.getOperator() ) { + case M: offset -= ce.getLength() ; break; + case I: + if ( offset >= ce.getLength() ) correction-= ce.getLength(); + else correction -= offset; + offset -= ce.getLength(); + break; + case D: correction+=ce.getLength(); + break; + } + } + return correction; + } + public void initPairwiseAlignments( IndexedSequence [] seqs ) { distances = new SymmetricMatrix( seqs.length ); alignments = new Matrix( seqs.length ); @@ -517,7 +730,7 @@ public class PileBuilder implements RecordPileReceiver { */ public double diameter(MultipleAlignment a) { double dmaxmin = 0.0; - System.out.print("\n["); + if ( mVerbosityLevel >= PILESUMMARY ) System.out.print("\nclosest neighbor for each seq: ["); Iterator ids1 = a.sequenceIdByOffsetIterator(); while ( ids1.hasNext() ) { Integer id1 = ids1.next(); @@ -528,10 +741,10 @@ public class PileBuilder implements RecordPileReceiver { d = Math.min(d,dpair); } // d = distance from id1 to its closest neighbor within the pile - if ( d < 1e99 ) System.out.printf("%8.4g",d); + if ( d < 1e99 && mVerbosityLevel >= PILESUMMARY ) System.out.printf("%8.4g",d); if ( d < 1e99 && d > dmaxmin ) dmaxmin = d; } - System.out.println(" ]"); + if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(" ]"); // dmaxmin = the largest distance from a sequence in this pile to its closest neighbor // System.out.println(); return dmaxmin; @@ -545,7 +758,7 @@ public class PileBuilder implements RecordPileReceiver { // IndexedSequence [] seqs = testSet3(K); // initialize test set data IndexedSequence [] seqs = testSet4(K); // initialize test set data - PileBuilder pb = new PileBuilder(); + PileBuilder pb = new PileBuilder("test1.bam",null,new DiscardingReceiver()); //pb.doMultipleAlignment(seqs); pb.doMultipleAlignment2(seqs); diff --git a/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java index a148903ce..9dd21e952 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java +++ b/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java @@ -91,6 +91,8 @@ public class SWPairwiseAlignment { PrimitivePair.Int p = new PrimitivePair.Int(); int maxscore = 0; + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + // look for largest score. we use >= combined with the traversal direction // to ensure that if two scores are equal, the one closer to diagonal gets picked for ( int i = 1 ; i < n+1 ; i++ ) { @@ -102,6 +104,7 @@ public class SWPairwiseAlignment { p.first = n; p.second = j ; maxscore = sw[n][j]; + segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment } } @@ -114,7 +117,6 @@ public class SWPairwiseAlignment { // to that sequence int state = MSTATE; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) int [] scores = new int[3]; @@ -160,12 +162,12 @@ public class SWPairwiseAlignment { case DSTATE: o = CigarOperator.D; break; } + alignment_offset = p.first - p.second; segment_length+=p.second; CigarElement e = new CigarElement(segment_length,o); lce.add(e); Collections.reverse(lce); alignmentCigar = new Cigar(lce); - alignment_offset = p.first - p.second; } /** Allows for separate gap opening end extension penalties, no direct backtracking. @@ -198,6 +200,8 @@ public class SWPairwiseAlignment { PrimitivePair.Int p = new PrimitivePair.Int(); double maxscore = 0.0; + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + // look for largest score. we use >= combined with the traversal direction // to ensure that if two scores are equal, the one closer to diagonal gets picked for ( int i = 1 ; i < n+1 ; i++ ) { @@ -209,6 +213,7 @@ public class SWPairwiseAlignment { p.first = n; p.second = j ; maxscore = sw[n][j]; + segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment } } @@ -221,7 +226,6 @@ public class SWPairwiseAlignment { // to that sequence int state = MSTATE; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) double [] scores = new double[3]; @@ -268,12 +272,12 @@ public class SWPairwiseAlignment { case ISTATE: o = CigarOperator.I; break; case DSTATE: o = CigarOperator.D; break; } + alignment_offset = p.first - p.second; segment_length+=p.second; CigarElement e = new CigarElement(segment_length,o); lce.add(e); Collections.reverse(lce); alignmentCigar = new Cigar(lce); - alignment_offset = p.first - p.second ; } @@ -342,6 +346,8 @@ public void align3(String a, String b) { PrimitivePair.Int p = new PrimitivePair.Int(); double maxscore = 0.0; + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + // look for largest score. we use >= combined with the traversal direction // to ensure that if two scores are equal, the one closer to diagonal gets picked for ( int i = 1 ; i < n+1 ; i++ ) { @@ -353,6 +359,7 @@ public void align3(String a, String b) { p.first = n; p.second = j ; maxscore = sw[n][j]; + segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment } } @@ -365,7 +372,6 @@ public void align3(String a, String b) { // to that sequence int state = MSTATE; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) double [] scores = new double[3]; @@ -419,12 +425,12 @@ public void align3(String a, String b) { case ISTATE: o = CigarOperator.I; break; case DSTATE: o = CigarOperator.D; break; } + alignment_offset = p.first - p.second; segment_length+=p.second; CigarElement e = new CigarElement(segment_length,o); lce.add(e); Collections.reverse(lce); alignmentCigar = new Cigar(lce); - alignment_offset = p.first - p.second; } public void align4(String a, String b) { @@ -485,6 +491,8 @@ public void align3(String a, String b) { PrimitivePair.Int p = new PrimitivePair.Int(); double maxscore = 0.0; + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + // look for largest score. we use >= combined with the traversal direction // to ensure that if two scores are equal, the one closer to diagonal gets picked for ( int i = 1 ; i < n+1 ; i++ ) { @@ -496,6 +504,7 @@ public void align3(String a, String b) { p.first = n; p.second = j ; maxscore = sw[n][j]; + segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment } } @@ -508,7 +517,6 @@ public void align3(String a, String b) { // to that sequence int state = MSTATE; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) double [] scores = new double[3]; @@ -558,12 +566,12 @@ public void align3(String a, String b) { case ISTATE: o = CigarOperator.I; break; case DSTATE: o = CigarOperator.D; break; } + alignment_offset = p.first - p.second; segment_length+=p.second; CigarElement e = new CigarElement(segment_length,o); lce.add(e); Collections.reverse(lce); alignmentCigar = new Cigar(lce); - alignment_offset = p.first - p.second; } private int w(char x, char y) { @@ -688,17 +696,21 @@ public void align3(String a, String b) { } // now pos1 = alignment_offset } - System.out.println(AlignmentUtils.toString(getCigar())); - System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length()); - System.out.println("offset="+alignment_offset); - - try { - System.out.println("pos1="+pos1+"; pos2=" + pos2); +/* debug prints: */ +// System.out.println(AlignmentUtils.toString(getCigar())); +// System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length()); +// System.out.println("offset="+alignment_offset); +// System.out.println("pos1="+pos1+"; pos2=" + pos2); +/**/ for ( int i = 0 ; i < getCigar().numCigarElements() ; i++ ) { CigarElement ce = getCigar().getCigarElement(i) ; switch( ce.getOperator() ) { case M: - for ( int k = 0 ; k < ce.getLength() ; k++ ) { + int z = ( i == 0 ? pos2 : 0); // if we are in the first element and seq overhangs to the left, + // start inside the first segment, at the position where actual matches begin + // check separately for pos1 < s1.length() since seq2 is allowed to overhang beyond seq1's end + for ( ; z < ce.getLength() && pos1 < s1.length() ; z++ ) { +// System.out.println("pos1="+pos1+"; pos2="+pos2+"; k="+z); if ( Character.toUpperCase(s1.charAt(pos1)) != Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*'); else bmm.append(' '); @@ -722,10 +734,7 @@ public void align3(String a, String b) { break; } } - } catch(Exception e) {} - b1.append("<---"); - b2.append("<---"); - bmm.append("<---"); + bmm.append('\n'); b1.append(s1,pos1,s1.length()); bmm.append(b1); @@ -733,6 +742,7 @@ public void align3(String a, String b) { b2.append(s2,pos2,s2.length()); bmm.append(b2); bmm.append('\n'); + return bmm.toString(); } diff --git a/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java b/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java index a5984dbe8..cf0c9b320 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java +++ b/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java @@ -231,7 +231,7 @@ public class SequencePile { if ( refC == '+' ) { // count number of observations for insertion for ( int j = 0 ; j < col.size() ; j++ ) { - if ( col.charAt(j) != '*' ) count++; + if ( col.charAt(j) != '*' && col.charAt(j) != ' ') count++; } } else { if ( headerGrid.charAt(i) == '-' ) {