massive changes everywhere; lots of bugs fixed; methods moved around; computation and printout of overall stats added; now decides whether to accept or reject 'improvement'; writes alignments into two output sam files (unmodified reads/failed piles into one, realigned piles into the other); special treat for paranoids: writes third sam file with all the analyzed reads, unmodified

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@197 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-03-26 02:26:17 +00:00
parent 0331cd8e95
commit f47a214f96
9 changed files with 429 additions and 119 deletions

View File

@ -113,8 +113,14 @@ public class AlignmentUtils {
Indel curr_indel = null;
switch(ce.getOperator()) {
case I: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I); break;
case I:
curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I);
if ( i == 0 ) System.out.println("WARNING: Indel at start!");
if ( i == c.numCigarElements() - 1) System.out.println("WARNING: Indel at end!");
break;
case D: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.D);
if ( i == 0 ) System.out.println("WARNING: Indel at start!");
if ( i == c.numCigarElements() - 1) System.out.println("WARNING: Indel at end!");
runninglength += ce.getLength();
break;
case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
@ -152,6 +158,7 @@ public class AlignmentUtils {
for ( Indel ind : indels ) {
CountedObject<Indel> ci = new CountedObject<Indel>(ind);
CountedObject<Indel> found = t.floor(ci);
// CountedObject<Indel> found2 = t.ceiling(ci);
if ( ci.equals( found ) ) found.increment(); // we did find our indel, advance the counter
else t.add(ci); // this is a new indel. Add it.

View File

@ -114,7 +114,7 @@ public class Indel implements Interval {
* @param i Another interval
* @return true iff intervals overlap
*/
@Override
public boolean overlapsP(Interval i) {
return ! disjointP(i); //To change body of implemented methods use File | Settings | File Templates.
}
@ -126,7 +126,6 @@ public class Indel implements Interval {
* @param i Another interval
* @return true iff intervals do not overlap
*/
@Override
public boolean disjointP(Interval i) {
return i.getStop() < this.getStart() || i.getStart() > this.getStop();
}
@ -135,7 +134,6 @@ public class Indel implements Interval {
* has length of 0.
* @return length of the event on the original, unmodified reference
*/
@Override
public long getLength() {
if ( mType == IndelType.I ) return 0;
return mLength;
@ -150,6 +148,6 @@ public class Indel implements Interval {
@Override
public int hashCode() {
return (int)( mStart << 2 + mLength );
return (int)( mStart << 6 + mStart + mLength );
}
}

View File

@ -11,8 +11,6 @@ import javax.swing.filechooser.FileNameExtensionFilter;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
import edu.mit.broad.picard.reference.ReferenceSequence;
@ -25,6 +23,10 @@ public class IndelInspector extends CommandLineProgram {
@Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n";
@Option(shortName="I", doc="SAM or BAM file for calling",optional=true) public File INPUT_FILE;
@Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION;
@Option(shortName="V",doc="Verbosity level: SILENT, PILESUMMARY, ALIGNMENTS", optional=true) public String VERBOSITY_LEVEL;
@Option(doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1;
@Option(doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2;
@Option(doc="[paranoid] Output \"control\" file (sam or bam): all reads picked and processed by this tool will be also saved, unmodified, into this file", optional=true) public String OUTC;
@Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
// @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
@ -46,7 +48,7 @@ public class IndelInspector extends CommandLineProgram {
System.out.println("Unknown value specified for ERR_MODE");
return 1;
}
final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/"));
samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
@ -60,12 +62,26 @@ public class IndelInspector extends CommandLineProgram {
ReferenceSequence contig_seq = null;
IndelRecordPileCollector col = null;
PileBuilder pileBuilder = new PileBuilder();
PassThroughWriter ptWriter = new PassThroughWriter(OUT1,samReader.getFileHeader());
PileBuilder pileBuilder = new PileBuilder(OUT2,samReader.getFileHeader(),ptWriter);
SAMFileWriter controlWriter = null;
if ( OUTC != null ) controlWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(samReader.getFileHeader(),false,new File(OUTC));
try {
col = new IndelRecordPileCollector(new DiscardingReceiver(), pileBuilder );
col = new IndelRecordPileCollector(ptWriter, pileBuilder, controlWriter );
} catch(Exception e) { System.err.println(e.getMessage()); }
if ( col == null ) return 1;
if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT");
if ( VERBOSITY_LEVEL.toUpperCase().equals("SILENT")) pileBuilder.setVerbosity(pileBuilder.SILENT);
else if ( VERBOSITY_LEVEL.toUpperCase().equals("PILESUMMARY") ) pileBuilder.setVerbosity(pileBuilder.PILESUMMARY);
else if ( VERBOSITY_LEVEL.toUpperCase().equals("ALIGNMENTS") ) pileBuilder.setVerbosity(pileBuilder.ALIGNMENTS);
else {
System.out.println("Unrecognized VERBOSITY_LEVEL setting.");
return 1;
}
String cur_contig = null;
int counter = 0;
@ -120,9 +136,14 @@ public class IndelInspector extends CommandLineProgram {
col.receive(r);
}
pileBuilder.printStats();
System.out.println("done.");
col.printLengthHistograms();
samReader.close();
pileBuilder.close();
ptWriter.close();
if ( controlWriter != null ) controlWriter.close();
return 0;
}

View File

@ -89,6 +89,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest
private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
private SAMFileWriter controlWriter;
private String referenceSequence;
public String memStatsString() {
@ -97,7 +99,12 @@ public class IndelRecordPileCollector implements RecordReceiver {
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
}
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
this(rr,rp,null);
}
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp, SAMFileWriter cw) throws java.io.IOException {
mRecordPile = new LinkedList<SAMRecord>();
mAllIndels = new TreeSet<CountedObject<Indel> >(
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
@ -113,6 +120,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
defaultReceiver = rr;
indelPileReceiver = rp;
referenceSequence = null;
controlWriter = cw;
setWaitState();
}
@ -146,6 +154,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
SAMRecord r = i.next();
if ( r.getAlignmentEnd() <= pos ) {
defaultReceiver.receive(r);
if ( controlWriter != null ) controlWriter.addAlignment(r);
i.remove();
} else break;
}
@ -161,6 +170,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
SAMRecord r = i.next();
if ( r.getAlignmentStart() >= pos ) {
defaultReceiver.receive(r);
if ( controlWriter != null ) controlWriter.addAlignment(r);
i.remove();
} else break;
}
@ -250,7 +260,10 @@ public class IndelRecordPileCollector implements RecordReceiver {
}
// no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile
for ( SAMRecord r : mRecordPile ) defaultReceiver.receive(r);
for ( SAMRecord r : mRecordPile ) {
defaultReceiver.receive(r);
if ( controlWriter != null ) controlWriter.addAlignment(r);
}
setWaitState();
return;
}
@ -325,7 +338,13 @@ public class IndelRecordPileCollector implements RecordReceiver {
System.out.print(finalPile.size() + " reads in the pile;") ;
System.out.println(formatRange(finalTrain));
indelPileReceiver.receive(finalPile);
} else for ( SAMRecord r : finalPile ) defaultReceiver.receive(r);
if ( controlWriter != null ) for ( SAMRecord r : finalPile ) controlWriter.addAlignment(r);
} else {
for ( SAMRecord r : finalPile ) {
defaultReceiver.receive(r);
controlWriter.addAlignment(r);
}
}
finalPile.clear();
finalTrain.clear();
curr_stop = -1;

View File

@ -240,7 +240,7 @@ public class MultipleAlignment implements Iterable<Integer> {
/** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately
* shifted and ready for printout;
*/
public String toString(boolean inorder) {
public String toString(boolean inorder, boolean dotprint) {
StringBuilder b = new StringBuilder();
java.util.Formatter frmt = new java.util.Formatter(b);
@ -278,8 +278,16 @@ public class MultipleAlignment implements Iterable<Integer> {
for ( int i = 0 ; i < seqs.size() ; i++ ) {
int index = (inorder ? perm[i] : i);
frmt.format("%3d:", ext_ids.get(index));
skipN(alignment_offsets.get(index)+ first_offset,b);
b.append(seqs.get(index));
int pos = alignment_offsets.get(index)+ first_offset; // start position on the consensus sequence
skipN(pos,b);
String aSeq = seqs.get(index);
if ( dotprint ) {
for ( int j = 0 ; j < aSeq.length() ; j++, pos++ ) {
if ( Character.toUpperCase(aSeq.charAt(j)) ==
Character.toUpperCase(consensusString[3][pos]) ) b.append('.');
else b.append(aSeq.charAt(j));
}
} else b.append(aSeq);
b.append('\n');
}
// b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n");
@ -290,7 +298,7 @@ public class MultipleAlignment implements Iterable<Integer> {
return consensus.getSequence();
}
public String toString() { return toString(true); }
public String toString() { return toString(true, false); }
public int size() { return seqs.size(); }

View File

@ -0,0 +1,34 @@
package org.broadinstitute.sting.playground.indels;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMFileHeader;
import java.io.File;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Mar 25, 2009
* Time: 8:27:09 PM
* To change this template use File | Settings | File Templates.
*/
public class PassThroughWriter implements RecordReceiver {
private SAMFileWriter writer;
public PassThroughWriter( File f, SAMFileHeader h) {
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(h, false, f);
}
public PassThroughWriter(String s, SAMFileHeader h) {
this(new File(s), h);
}
public void receive(SAMRecord r) {
//To change body of implemented methods use File | Settings | File Templates.
writer.addAlignment(r);
}
public void close() { writer.close() ; }
}

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.indels;
import net.sf.samtools.*;
import java.util.*;
import java.io.File;
import org.broadinstitute.sting.utils.PrimitivePair;
import org.broadinstitute.sting.playground.utils.CountedObject;
import org.broadinstitute.sting.playground.utils.CountedObjectComparatorAdapter;
@ -17,6 +19,30 @@ public class PileBuilder implements RecordPileReceiver {
private String referenceSequence;
private int reference_start;
private int processed_piles = 0;
private int improved_piles = 0;
private int unmodified_piles = 0;
private int failed_piles = 0;
private int indels_improved = 0;
private int indel_improvement_cnt = 0;
private int indels_discarded = 0;
private int indels_added = 0;
private int indels_added_cnt = 0;
private int total_mismatches_count_in_improved = 0;
private int total_mismatches_count_in_failed = 0;
private int total_improved_mismatches_count = 0;
private int total_reads_in_improved = 0;
private int total_reads_in_failed = 0;
private int total_alignments_modified = 0;
public final static int SILENT = 0;
public final static int PILESUMMARY = 1;
public final static int ALIGNMENTS = 2;
private int mVerbosityLevel = SILENT;
private SAMFileWriter samWriter;
private RecordReceiver failedPileReceiver;
private static class SelectedPair {
private int i_;
@ -68,9 +94,15 @@ public class PileBuilder implements RecordPileReceiver {
}
public PileBuilder() {
public PileBuilder(File f, SAMFileHeader h, RecordReceiver fr) {
samWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(h,false,f);
referenceSequence = null;
reference_start = -1;
failedPileReceiver = fr;
}
public PileBuilder(String s, SAMFileHeader h, RecordReceiver fr) {
this(new File(s),h, fr);
}
public void setReferenceSequence(String seq, int start) {
@ -84,6 +116,10 @@ public class PileBuilder implements RecordPileReceiver {
}
public void receive(Collection<SAMRecord> c) {
//TODO: if read starts/ends with an indel (insertion, actually), we detect this as a "different" indel introduced during cleanup.
processed_piles++;
IndexedSequence[] seqs = new IndexedSequence[c.size()];
int i = 0;
int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref
@ -101,18 +137,24 @@ public class PileBuilder implements RecordPileReceiver {
TreeSet< CountedObject<Indel> > all_indels = new TreeSet< CountedObject<Indel> >(
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
SequencePile originalAligns = new SequencePile(pileRef);
SequencePile originalAligns = null;
if ( mVerbosityLevel >= ALIGNMENTS ) originalAligns = new SequencePile(pileRef);
for ( SAMRecord r : c ) {
originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
r.getCigar(), r.getAlignmentStart() - startOnRef );
if ( mVerbosityLevel >= ALIGNMENTS ) {
originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
r.getCigar(), r.getAlignmentStart() - startOnRef );
}
totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence);
AlignmentUtils.collectAndCountIndels(r,all_indels);
}
System.out.println("\n#############################################################################");
System.out.println("ORIGINAL ALIGNMENT: \n");
originalAligns.dotprint(true);
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
if ( mVerbosityLevel >= ALIGNMENTS ) {
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
System.out.println("ORIGINAL ALIGNMENT: \n");
originalAligns.dotprint(true);
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
}
List<MultipleAlignment> piles = doMultipleAlignment2(seqs);
@ -120,91 +162,244 @@ public class PileBuilder implements RecordPileReceiver {
// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
// System.out.println("; diameter of PILE2: "+ diameter(alignments2));
SymmetricMatrix d = new SymmetricMatrix(piles.size());
for ( int n = 0 ; n < piles.size() ; n++ ) {
d.set(n,n,diameter(piles.get(n)));
for ( int m = n+1 ; m < piles.size() ; m++ ) {
d.set(n,m,distance(piles.get(n), piles.get(m)));
SymmetricMatrix d = new SymmetricMatrix(piles.size());
for ( int n = 0 ; n < piles.size() ; n++ ) {
d.set(n,n,diameter(piles.get(n)));
for ( int m = n+1 ; m < piles.size() ; m++ ) {
d.set(n,m,distance(piles.get(n), piles.get(m)));
}
}
}
int new_mismatches = 0 ; // number of mismatches after re-alignment:
TreeSet< CountedObject<Indel> > new_indels = new TreeSet< CountedObject<Indel> >(
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator())
); // new indels after realignment
int shifted_reads = 0;
int smashed_reads = 0;
int new_mismatches = 0 ; // number of mismatches after re-alignment:
TreeSet< CountedObject<Indel> > new_indels = new TreeSet< CountedObject<Indel> >(
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator())
); // new indels after realignment
int shifted_reads = 0;
int smashed_reads = 0;
List<SAMRecord> as_list = (List<SAMRecord>)c; // ugly hack; need this to access records by ids
List<SAMRecord> as_list = (List<SAMRecord>)c; // ugly hack; need this to access records by ids
System.out.println(d.format("%8.4g"));
for ( int n = 0 ; n < piles.size() ; n++ ) {
if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(d.format("%8.4g"));
for ( int n = 0 ; n < piles.size() ; n++ ) {
// SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0);
SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5);
if ( mVerbosityLevel >= ALIGNMENTS ) {
System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):");
System.out.println(consToRef.toString());
System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString(true,true));
}
// SequencePile pileAligns = new SequencePile(pileRef);
MultipleAlignment ma = piles.get(n);
for ( Integer id : ma ) {
SAMRecord r = as_list.get(id);
int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq
/*
System.out.println("id=" + id +": offset on consensus="+cons_offset+
"; consensus wrt ref chunk="+consToRef.getAlignmentStart2wrt1()+"; chunk start="+startOnRef);
*/
int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar());
if ( ref_offset != r.getAlignmentStart()) shifted_reads++;
Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar());
/*
if ( id == 9 ) {
System.out.println("ref_offset="+ref_offset+"; orig_ref_off="+r.getAlignmentStart()+"; "+
AlignmentUtils.toString(cig));
}
System.out.println("adding "+id+" at "+ (ref_offset - refStarttemp));
pileAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), cig, ref_offset - refStarttemp);
*/
if ( cig.numCigarElements() != r.getCigar().numCigarElements() ) smashed_reads++;
if ( ref_offset != r.getAlignmentStart() || cig.numCigarElements() != r.getCigar().numCigarElements() ) total_alignments_modified++;
SAMRecord rtest = new SAMRecord(r.getHeader());
rtest.setAlignmentStart(ref_offset);
rtest.setReadString(r.getReadString());
rtest.setReadUmappedFlag(r.getReadUnmappedFlag());
rtest.setCigar(cig);
AlignmentUtils.collectAndCountIndels(rtest,new_indels);
new_mismatches += AlignmentUtils.numMismatches(rtest,referenceSequence);
}
// pileAligns.colorprint(true);
}
boolean pile_improved = false;
boolean pile_unmodified = false;
boolean pile_failed = false;
double mmChangePct = Math.abs((new_mismatches - totalMismatches)*100.0/totalMismatches);
if ( shifted_reads == 0 && smashed_reads == 0 ) pile_unmodified = true;
else {
if ( new_mismatches < totalMismatches ||
mmChangePct < 10.0 && ( new_indels.size() < all_indels.size() )
) pile_improved = true;
else pile_failed = true;
}
if ( pile_improved ) {
total_mismatches_count_in_improved +=totalMismatches;
total_improved_mismatches_count += new_mismatches;
total_reads_in_improved += c.size() ;
}
if ( pile_failed ) {
total_mismatches_count_in_failed += totalMismatches;
total_reads_in_failed += c.size();
}
int discovered_indels = 0;
int discovered_support = 0;
int existing_indels = 0;
int existing_support = 0;
int existing_support_new = 0;
int discarded_indels = 0;
for ( CountedObject<Indel> ind : new_indels ) {
//System.out.print("new indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop());
if ( ! all_indels.contains(ind) ) {
//System.out.println(" (DISCOVERED)");
discovered_indels++;
discovered_support += ind.getCount();
if ( pile_improved ) {
indels_added++;
indels_added_cnt += ind.getCount();
}
} else {
//System.out.println(" (EXISTING)");
existing_indels++;
existing_support_new += ind.getCount();
if ( pile_improved && ( ind.getCount() > all_indels.floor(ind).getCount() ) ) {
if ( ! ind.equals(all_indels.floor(ind))) System.out.println("ERROR MATCHING INDELS!!!") ;
indels_improved++;
indel_improvement_cnt += ( ind.getCount() - all_indels.floor(ind).getCount() );
}
}
}
for ( CountedObject<Indel> ind : all_indels ) {
//System.out.print("old indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop());
if ( ! new_indels.contains(ind )) {
//System.out.println(" (DISCARDED)");
discarded_indels++;
if ( pile_improved ) indels_discarded++;
} else {
//System.out.println(" (KEPT)");
existing_support += ind.getCount();
}
}
if ( pile_improved ) improved_piles++;
if ( pile_unmodified ) unmodified_piles++;
if ( pile_failed ) failed_piles++;
if ( mVerbosityLevel >= PILESUMMARY ) {
System.out.print("TOTAL MISMATCHES: "+totalMismatches +" --> "+new_mismatches);
if ( totalMismatches > new_mismatches ) System.out.print("(-");
else System.out.print("(+");
System.out.printf("%.2f%%)%n",mmChangePct);
System.out.println("CONFIRMED INDELS: "+existing_indels);
System.out.print("CONFIRMED INDEL SUPPORT: "+existing_support + " --> " + existing_support_new );
if ( existing_support > existing_support_new ) System.out.print("(-");
else System.out.print("(+");
System.out.printf("%.2f%%)%n",Math.abs((existing_support- existing_support_new)*100.0/existing_support));
System.out.println("DROPPED INDELS: " + discarded_indels);
System.out.println("DISCOVERED INDELS: " + discovered_indels) ;
System.out.println("DISCOVERED INDELS SUPPORT: "+discovered_support);
System.out.println("ALIGNMENTS SHIFTED: "+shifted_reads);
System.out.println("ALIGNMENTS WITH GAPS CHANGED: "+smashed_reads);
if ( pile_improved ) System.out.println("OUTCOME: IMPROVED");
if ( pile_unmodified ) System.out.println("OUTCOME: UNCHANGED");
if ( pile_failed ) System.out.println("OUTCOME: FAILED");
System.out.println("\n#############################################################################\n");
}
// finally, writing stuff:
for ( int n = 0 ; n < piles.size() ; n++ ) {
SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5);
System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):");
System.out.println(consToRef.toString());
System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString());
MultipleAlignment ma = piles.get(n);
for ( Integer id : ma ) {
Iterator<Integer> id_iter = ma.sequenceIdByOffsetIterator();
while ( id_iter.hasNext() ) {
int id = id_iter.next();
SAMRecord r = as_list.get(id);
if ( pile_failed || pile_unmodified ) {
failedPileReceiver.receive(r); // nothing to do, send failed piles directly for writing
continue;
}
// we improved stuff!! let's reset the alignment parameters!
int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq
int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1();
if ( ref_offset != r.getAlignmentStart()) shifted_reads++;
// offset of the realigned read r on the reference
int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar());
r.setAlignmentStart(ref_offset);
Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar());
if ( cig.numCigarElements() != r.getCigar().numCigarElements() ) smashed_reads++;
SAMRecord rtest = new SAMRecord(r.getHeader());
rtest.setAlignmentStart(ref_offset);
rtest.setReadString(r.getReadString());
rtest.setReadUmappedFlag(r.getReadUnmappedFlag());
rtest.setCigar(cig);
AlignmentUtils.collectAndCountIndels(rtest,new_indels);
new_mismatches += AlignmentUtils.numMismatches(rtest,referenceSequence);
}
}
r.setCigar(cig);
r.setAttribute("NM",new Integer(AlignmentUtils.numMismatches(r,referenceSequence)));
if ( r.getAlignmentStart() == 713655 ) {
System.out.println("!!!----> "+r.format());
System.out.println("!!!----> "+AlignmentUtils.toString(cig) +" --- " +AlignmentUtils.toString(r.getCigar()));
}
// System.out.println("writing " + id);
samWriter.addAlignment(r);
int discovered_indels = 0;
int discovered_support = 0;
int existing_indels = 0;
int existing_support = 0;
int existing_support_new = 0;
int discarded_indels = 0;
for ( CountedObject<Indel> ind : new_indels ) {
if ( ! all_indels.contains(ind) ) {
discovered_indels++;
discovered_support += ind.getCount();
} else {
existing_indels++;
existing_support_new += ind.getCount();
}
}
for ( CountedObject<Indel> ind : all_indels ) {
if ( ! new_indels.contains(ind )) {
discarded_indels++;
} else {
existing_support += ind.getCount();
}
}
System.out.print("TOTAL MISMATCHES: "+totalMismatches +" --> "+new_mismatches);
if ( totalMismatches > new_mismatches ) System.out.print("(-");
else System.out.print("(+");
System.out.println(Math.abs((new_mismatches - totalMismatches)*100.0/totalMismatches)+"%)");
System.out.println("CONFIRMED INDELS: "+existing_indels);
System.out.print("CONFIRMED INDEL SUPPORT: "+existing_support + " --> " + existing_support_new );
if ( existing_support > existing_support_new ) System.out.print("(-");
else System.out.print("(+");
System.out.println(Math.abs((existing_support- existing_support_new)*100.0/existing_support)+"%)");
System.out.println("DROPPED INDELS: " + discarded_indels);
System.out.println("DISCOVERED INDELS: " + discovered_indels) ;
System.out.println("DISCOVERED INDELS SUPPORT: "+discovered_support);
System.out.println("ALIGNMENTS SHIFTED: "+shifted_reads);
System.out.println("ALIGNMENTS WITH GAPS CHANGED: "+smashed_reads);
}
public void close() { samWriter.close(); }
public double pct (int i, int t) {
return ((double)i*100.0/((double)t));
}
public void printStats() {
System.out.println("\n---------------------------------------------------------------------------------");
System.out.println("Piles processed: "+ processed_piles);
System.out.printf("Piles improved: %d (%.2f%%)%n", improved_piles,pct(improved_piles,processed_piles));
System.out.printf("Piles confirmed (unchanged): %d (%.2f%%)%n", unmodified_piles,pct(unmodified_piles,processed_piles));
System.out.printf("Piles failed: %d (%.2f%%)%n", failed_piles,pct(failed_piles,processed_piles));
System.out.println("In improved piles:");
System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_improved,
(double)total_reads_in_improved/(double)improved_piles,(double) total_mismatches_count_in_improved /(double)total_reads_in_improved);
System.out.printf(" Overall mismatch count: %d --> %d (%.2f%%)%n", total_mismatches_count_in_improved,total_improved_mismatches_count,
pct(total_improved_mismatches_count- total_mismatches_count_in_improved, total_mismatches_count_in_improved));
System.out.printf(" Mismatch improvement: suppressed %.2f mm/read%n",
(double)(total_mismatches_count_in_improved -total_improved_mismatches_count)/(double)total_reads_in_improved );
System.out.printf(" Alignments modified: %d (%.2f%% of total or %.2f per pile)%n",total_alignments_modified,
pct(total_alignments_modified,total_reads_in_improved),(double)total_alignments_modified/(double)improved_piles);
System.out.printf(" Improved indels: %d (%.2f per pile) with %.3f additional reads per indel%n",
indels_improved,(double)indels_improved/(double)improved_piles,(double)indel_improvement_cnt/(double)indels_improved);
System.out.printf(" New indels: %d (%.2f per pile) with %.3f reads per indel%n",
indels_added,(double)indels_added/(double)improved_piles,(double)indels_added_cnt/(double)indels_added);
System.out.printf(" Discarded indels: %d (%.2f per pile)%n",
indels_discarded,(double)indels_discarded/(double)improved_piles);
System.out.println("In failed piles:");
System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_failed,
(double)total_reads_in_failed/(double)failed_piles,(double) total_mismatches_count_in_failed /(double)total_reads_in_failed);
System.out.println("---------------------------------------------------------------------------------\n");
}
public void setVerbosity(int v) {
mVerbosityLevel = v;
}
/** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X,
* and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct
* alignment of the read to Y (i.e. if the alignment of X to Y contains indel(s) and the read spans them, the
@ -224,7 +419,7 @@ public class PileBuilder implements RecordPileReceiver {
int i = 0;
while ( refpos <= s ) {
celem = baseCigar.getCigarElement(i);
refpos+=celem.getLength();
if ( celem.getOperator() != CigarOperator.D ) refpos+=celem.getLength();
i++;
}
// we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element
@ -241,6 +436,24 @@ public class PileBuilder implements RecordPileReceiver {
return new Cigar(lce);
}
private int indelCorrection(int offset, Cigar cig) {
int correction = 0;
for ( int i = 0 ; i < cig.numCigarElements() && offset > 0 ; i++ ) {
CigarElement ce = cig.getCigarElement(i);
switch ( ce.getOperator() ) {
case M: offset -= ce.getLength() ; break;
case I:
if ( offset >= ce.getLength() ) correction-= ce.getLength();
else correction -= offset;
offset -= ce.getLength();
break;
case D: correction+=ce.getLength();
break;
}
}
return correction;
}
public void initPairwiseAlignments( IndexedSequence [] seqs ) {
distances = new SymmetricMatrix( seqs.length );
alignments = new Matrix<PairwiseAlignment>( seqs.length );
@ -517,7 +730,7 @@ public class PileBuilder implements RecordPileReceiver {
*/
public double diameter(MultipleAlignment a) {
double dmaxmin = 0.0;
System.out.print("\n[");
if ( mVerbosityLevel >= PILESUMMARY ) System.out.print("\nclosest neighbor for each seq: [");
Iterator<Integer> ids1 = a.sequenceIdByOffsetIterator();
while ( ids1.hasNext() ) {
Integer id1 = ids1.next();
@ -528,10 +741,10 @@ public class PileBuilder implements RecordPileReceiver {
d = Math.min(d,dpair);
}
// d = distance from id1 to its closest neighbor within the pile
if ( d < 1e99 ) System.out.printf("%8.4g",d);
if ( d < 1e99 && mVerbosityLevel >= PILESUMMARY ) System.out.printf("%8.4g",d);
if ( d < 1e99 && d > dmaxmin ) dmaxmin = d;
}
System.out.println(" ]");
if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(" ]");
// dmaxmin = the largest distance from a sequence in this pile to its closest neighbor
// System.out.println();
return dmaxmin;
@ -545,7 +758,7 @@ public class PileBuilder implements RecordPileReceiver {
// IndexedSequence [] seqs = testSet3(K); // initialize test set data
IndexedSequence [] seqs = testSet4(K); // initialize test set data
PileBuilder pb = new PileBuilder();
PileBuilder pb = new PileBuilder("test1.bam",null,new DiscardingReceiver());
//pb.doMultipleAlignment(seqs);
pb.doMultipleAlignment2(seqs);

View File

@ -91,6 +91,8 @@ public class SWPairwiseAlignment {
PrimitivePair.Int p = new PrimitivePair.Int();
int maxscore = 0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1 ; i < n+1 ; i++ ) {
@ -102,6 +104,7 @@ public class SWPairwiseAlignment {
p.first = n;
p.second = j ;
maxscore = sw[n][j];
segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
@ -114,7 +117,6 @@ public class SWPairwiseAlignment {
// to that sequence
int state = MSTATE;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
int [] scores = new int[3];
@ -160,12 +162,12 @@ public class SWPairwiseAlignment {
case DSTATE: o = CigarOperator.D; break;
}
alignment_offset = p.first - p.second;
segment_length+=p.second;
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
alignment_offset = p.first - p.second;
}
/** Allows for separate gap opening end extension penalties, no direct backtracking.
@ -198,6 +200,8 @@ public class SWPairwiseAlignment {
PrimitivePair.Int p = new PrimitivePair.Int();
double maxscore = 0.0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1 ; i < n+1 ; i++ ) {
@ -209,6 +213,7 @@ public class SWPairwiseAlignment {
p.first = n;
p.second = j ;
maxscore = sw[n][j];
segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
@ -221,7 +226,6 @@ public class SWPairwiseAlignment {
// to that sequence
int state = MSTATE;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
double [] scores = new double[3];
@ -268,12 +272,12 @@ public class SWPairwiseAlignment {
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
}
alignment_offset = p.first - p.second;
segment_length+=p.second;
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
alignment_offset = p.first - p.second ;
}
@ -342,6 +346,8 @@ public void align3(String a, String b) {
PrimitivePair.Int p = new PrimitivePair.Int();
double maxscore = 0.0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1 ; i < n+1 ; i++ ) {
@ -353,6 +359,7 @@ public void align3(String a, String b) {
p.first = n;
p.second = j ;
maxscore = sw[n][j];
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
@ -365,7 +372,6 @@ public void align3(String a, String b) {
// to that sequence
int state = MSTATE;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
double [] scores = new double[3];
@ -419,12 +425,12 @@ public void align3(String a, String b) {
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
}
alignment_offset = p.first - p.second;
segment_length+=p.second;
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
alignment_offset = p.first - p.second;
}
public void align4(String a, String b) {
@ -485,6 +491,8 @@ public void align3(String a, String b) {
PrimitivePair.Int p = new PrimitivePair.Int();
double maxscore = 0.0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1 ; i < n+1 ; i++ ) {
@ -496,6 +504,7 @@ public void align3(String a, String b) {
p.first = n;
p.second = j ;
maxscore = sw[n][j];
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
@ -508,7 +517,6 @@ public void align3(String a, String b) {
// to that sequence
int state = MSTATE;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
double [] scores = new double[3];
@ -558,12 +566,12 @@ public void align3(String a, String b) {
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
}
alignment_offset = p.first - p.second;
segment_length+=p.second;
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
alignment_offset = p.first - p.second;
}
private int w(char x, char y) {
@ -688,17 +696,21 @@ public void align3(String a, String b) {
}
// now pos1 = alignment_offset
}
System.out.println(AlignmentUtils.toString(getCigar()));
System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length());
System.out.println("offset="+alignment_offset);
try {
System.out.println("pos1="+pos1+"; pos2=" + pos2);
/* debug prints: */
// System.out.println(AlignmentUtils.toString(getCigar()));
// System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length());
// System.out.println("offset="+alignment_offset);
// System.out.println("pos1="+pos1+"; pos2=" + pos2);
/**/
for ( int i = 0 ; i < getCigar().numCigarElements() ; i++ ) {
CigarElement ce = getCigar().getCigarElement(i) ;
switch( ce.getOperator() ) {
case M:
for ( int k = 0 ; k < ce.getLength() ; k++ ) {
int z = ( i == 0 ? pos2 : 0); // if we are in the first element and seq overhangs to the left,
// start inside the first segment, at the position where actual matches begin
// check separately for pos1 < s1.length() since seq2 is allowed to overhang beyond seq1's end
for ( ; z < ce.getLength() && pos1 < s1.length() ; z++ ) {
// System.out.println("pos1="+pos1+"; pos2="+pos2+"; k="+z);
if ( Character.toUpperCase(s1.charAt(pos1)) !=
Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*');
else bmm.append(' ');
@ -722,10 +734,7 @@ public void align3(String a, String b) {
break;
}
}
} catch(Exception e) {}
b1.append("<---");
b2.append("<---");
bmm.append("<---");
bmm.append('\n');
b1.append(s1,pos1,s1.length());
bmm.append(b1);
@ -733,6 +742,7 @@ public void align3(String a, String b) {
b2.append(s2,pos2,s2.length());
bmm.append(b2);
bmm.append('\n');
return bmm.toString();
}

View File

@ -231,7 +231,7 @@ public class SequencePile {
if ( refC == '+' ) {
// count number of observations for insertion
for ( int j = 0 ; j < col.size() ; j++ ) {
if ( col.charAt(j) != '*' ) count++;
if ( col.charAt(j) != '*' && col.charAt(j) != ' ') count++;
}
} else {
if ( headerGrid.charAt(i) == '-' ) {