massive changes everywhere; lots of bugs fixed; methods moved around; computation and printout of overall stats added; now decides whether to accept or reject 'improvement'; writes alignments into two output sam files (unmodified reads/failed piles into one, realigned piles into the other); special treat for paranoids: writes third sam file with all the analyzed reads, unmodified
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@197 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0331cd8e95
commit
f47a214f96
|
|
@ -113,8 +113,14 @@ public class AlignmentUtils {
|
||||||
Indel curr_indel = null;
|
Indel curr_indel = null;
|
||||||
|
|
||||||
switch(ce.getOperator()) {
|
switch(ce.getOperator()) {
|
||||||
case I: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I); break;
|
case I:
|
||||||
|
curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I);
|
||||||
|
if ( i == 0 ) System.out.println("WARNING: Indel at start!");
|
||||||
|
if ( i == c.numCigarElements() - 1) System.out.println("WARNING: Indel at end!");
|
||||||
|
break;
|
||||||
case D: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.D);
|
case D: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.D);
|
||||||
|
if ( i == 0 ) System.out.println("WARNING: Indel at start!");
|
||||||
|
if ( i == c.numCigarElements() - 1) System.out.println("WARNING: Indel at end!");
|
||||||
runninglength += ce.getLength();
|
runninglength += ce.getLength();
|
||||||
break;
|
break;
|
||||||
case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
|
case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
|
||||||
|
|
@ -152,6 +158,7 @@ public class AlignmentUtils {
|
||||||
for ( Indel ind : indels ) {
|
for ( Indel ind : indels ) {
|
||||||
CountedObject<Indel> ci = new CountedObject<Indel>(ind);
|
CountedObject<Indel> ci = new CountedObject<Indel>(ind);
|
||||||
CountedObject<Indel> found = t.floor(ci);
|
CountedObject<Indel> found = t.floor(ci);
|
||||||
|
// CountedObject<Indel> found2 = t.ceiling(ci);
|
||||||
|
|
||||||
if ( ci.equals( found ) ) found.increment(); // we did find our indel, advance the counter
|
if ( ci.equals( found ) ) found.increment(); // we did find our indel, advance the counter
|
||||||
else t.add(ci); // this is a new indel. Add it.
|
else t.add(ci); // this is a new indel. Add it.
|
||||||
|
|
|
||||||
|
|
@ -114,7 +114,7 @@ public class Indel implements Interval {
|
||||||
* @param i Another interval
|
* @param i Another interval
|
||||||
* @return true iff intervals overlap
|
* @return true iff intervals overlap
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public boolean overlapsP(Interval i) {
|
public boolean overlapsP(Interval i) {
|
||||||
return ! disjointP(i); //To change body of implemented methods use File | Settings | File Templates.
|
return ! disjointP(i); //To change body of implemented methods use File | Settings | File Templates.
|
||||||
}
|
}
|
||||||
|
|
@ -126,7 +126,6 @@ public class Indel implements Interval {
|
||||||
* @param i Another interval
|
* @param i Another interval
|
||||||
* @return true iff intervals do not overlap
|
* @return true iff intervals do not overlap
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public boolean disjointP(Interval i) {
|
public boolean disjointP(Interval i) {
|
||||||
return i.getStop() < this.getStart() || i.getStart() > this.getStop();
|
return i.getStop() < this.getStart() || i.getStart() > this.getStop();
|
||||||
}
|
}
|
||||||
|
|
@ -135,7 +134,6 @@ public class Indel implements Interval {
|
||||||
* has length of 0.
|
* has length of 0.
|
||||||
* @return length of the event on the original, unmodified reference
|
* @return length of the event on the original, unmodified reference
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public long getLength() {
|
public long getLength() {
|
||||||
if ( mType == IndelType.I ) return 0;
|
if ( mType == IndelType.I ) return 0;
|
||||||
return mLength;
|
return mLength;
|
||||||
|
|
@ -150,6 +148,6 @@ public class Indel implements Interval {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return (int)( mStart << 2 + mLength );
|
return (int)( mStart << 6 + mStart + mLength );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -11,8 +11,6 @@ import javax.swing.filechooser.FileNameExtensionFilter;
|
||||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||||
import edu.mit.broad.picard.cmdline.Option;
|
import edu.mit.broad.picard.cmdline.Option;
|
||||||
import edu.mit.broad.picard.cmdline.Usage;
|
import edu.mit.broad.picard.cmdline.Usage;
|
||||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
|
||||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
|
||||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
|
import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
|
||||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||||
|
|
||||||
|
|
@ -25,6 +23,10 @@ public class IndelInspector extends CommandLineProgram {
|
||||||
@Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n";
|
@Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n";
|
||||||
@Option(shortName="I", doc="SAM or BAM file for calling",optional=true) public File INPUT_FILE;
|
@Option(shortName="I", doc="SAM or BAM file for calling",optional=true) public File INPUT_FILE;
|
||||||
@Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION;
|
@Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION;
|
||||||
|
@Option(shortName="V",doc="Verbosity level: SILENT, PILESUMMARY, ALIGNMENTS", optional=true) public String VERBOSITY_LEVEL;
|
||||||
|
@Option(doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1;
|
||||||
|
@Option(doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2;
|
||||||
|
@Option(doc="[paranoid] Output \"control\" file (sam or bam): all reads picked and processed by this tool will be also saved, unmodified, into this file", optional=true) public String OUTC;
|
||||||
@Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
|
@Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
|
||||||
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
|
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
|
||||||
// @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
|
// @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
|
||||||
|
|
@ -46,7 +48,7 @@ public class IndelInspector extends CommandLineProgram {
|
||||||
System.out.println("Unknown value specified for ERR_MODE");
|
System.out.println("Unknown value specified for ERR_MODE");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/"));
|
final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/"));
|
||||||
samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
||||||
|
|
||||||
|
|
@ -60,12 +62,26 @@ public class IndelInspector extends CommandLineProgram {
|
||||||
ReferenceSequence contig_seq = null;
|
ReferenceSequence contig_seq = null;
|
||||||
|
|
||||||
IndelRecordPileCollector col = null;
|
IndelRecordPileCollector col = null;
|
||||||
PileBuilder pileBuilder = new PileBuilder();
|
PassThroughWriter ptWriter = new PassThroughWriter(OUT1,samReader.getFileHeader());
|
||||||
|
PileBuilder pileBuilder = new PileBuilder(OUT2,samReader.getFileHeader(),ptWriter);
|
||||||
|
|
||||||
|
SAMFileWriter controlWriter = null;
|
||||||
|
if ( OUTC != null ) controlWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(samReader.getFileHeader(),false,new File(OUTC));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
col = new IndelRecordPileCollector(new DiscardingReceiver(), pileBuilder );
|
col = new IndelRecordPileCollector(ptWriter, pileBuilder, controlWriter );
|
||||||
} catch(Exception e) { System.err.println(e.getMessage()); }
|
} catch(Exception e) { System.err.println(e.getMessage()); }
|
||||||
if ( col == null ) return 1;
|
if ( col == null ) return 1;
|
||||||
|
|
||||||
|
if ( VERBOSITY_LEVEL == null ) VERBOSITY_LEVEL = new String("SILENT");
|
||||||
|
if ( VERBOSITY_LEVEL.toUpperCase().equals("SILENT")) pileBuilder.setVerbosity(pileBuilder.SILENT);
|
||||||
|
else if ( VERBOSITY_LEVEL.toUpperCase().equals("PILESUMMARY") ) pileBuilder.setVerbosity(pileBuilder.PILESUMMARY);
|
||||||
|
else if ( VERBOSITY_LEVEL.toUpperCase().equals("ALIGNMENTS") ) pileBuilder.setVerbosity(pileBuilder.ALIGNMENTS);
|
||||||
|
else {
|
||||||
|
System.out.println("Unrecognized VERBOSITY_LEVEL setting.");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
String cur_contig = null;
|
String cur_contig = null;
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
|
|
||||||
|
|
@ -120,9 +136,14 @@ public class IndelInspector extends CommandLineProgram {
|
||||||
col.receive(r);
|
col.receive(r);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pileBuilder.printStats();
|
||||||
System.out.println("done.");
|
System.out.println("done.");
|
||||||
col.printLengthHistograms();
|
col.printLengthHistograms();
|
||||||
samReader.close();
|
samReader.close();
|
||||||
|
pileBuilder.close();
|
||||||
|
ptWriter.close();
|
||||||
|
if ( controlWriter != null ) controlWriter.close();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -89,6 +89,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest
|
private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest
|
||||||
private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
|
private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
|
||||||
|
|
||||||
|
private SAMFileWriter controlWriter;
|
||||||
|
|
||||||
private String referenceSequence;
|
private String referenceSequence;
|
||||||
|
|
||||||
public String memStatsString() {
|
public String memStatsString() {
|
||||||
|
|
@ -97,7 +99,12 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
|
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Creates a collector without a control writer; delegates to the three-argument
 * constructor, passing {@code null} for the writer.
 *
 * @param rr receiver handed to the three-argument constructor as the record receiver
 * @param rp receiver handed to the three-argument constructor as the pile receiver
 * @throws java.io.IOException declared for compatibility with the three-argument constructor
 */
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
    this(rr, rp, (SAMFileWriter) null);
}
|
||||||
|
|
||||||
|
|
||||||
|
public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp, SAMFileWriter cw) throws java.io.IOException {
|
||||||
mRecordPile = new LinkedList<SAMRecord>();
|
mRecordPile = new LinkedList<SAMRecord>();
|
||||||
mAllIndels = new TreeSet<CountedObject<Indel> >(
|
mAllIndels = new TreeSet<CountedObject<Indel> >(
|
||||||
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
|
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
|
||||||
|
|
@ -113,6 +120,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
defaultReceiver = rr;
|
defaultReceiver = rr;
|
||||||
indelPileReceiver = rp;
|
indelPileReceiver = rp;
|
||||||
referenceSequence = null;
|
referenceSequence = null;
|
||||||
|
controlWriter = cw;
|
||||||
setWaitState();
|
setWaitState();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -146,6 +154,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
SAMRecord r = i.next();
|
SAMRecord r = i.next();
|
||||||
if ( r.getAlignmentEnd() <= pos ) {
|
if ( r.getAlignmentEnd() <= pos ) {
|
||||||
defaultReceiver.receive(r);
|
defaultReceiver.receive(r);
|
||||||
|
if ( controlWriter != null ) controlWriter.addAlignment(r);
|
||||||
i.remove();
|
i.remove();
|
||||||
} else break;
|
} else break;
|
||||||
}
|
}
|
||||||
|
|
@ -161,6 +170,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
SAMRecord r = i.next();
|
SAMRecord r = i.next();
|
||||||
if ( r.getAlignmentStart() >= pos ) {
|
if ( r.getAlignmentStart() >= pos ) {
|
||||||
defaultReceiver.receive(r);
|
defaultReceiver.receive(r);
|
||||||
|
if ( controlWriter != null ) controlWriter.addAlignment(r);
|
||||||
i.remove();
|
i.remove();
|
||||||
} else break;
|
} else break;
|
||||||
}
|
}
|
||||||
|
|
@ -250,7 +260,10 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
}
|
}
|
||||||
|
|
||||||
// no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile
|
// no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile
|
||||||
for ( SAMRecord r : mRecordPile ) defaultReceiver.receive(r);
|
for ( SAMRecord r : mRecordPile ) {
|
||||||
|
defaultReceiver.receive(r);
|
||||||
|
if ( controlWriter != null ) controlWriter.addAlignment(r);
|
||||||
|
}
|
||||||
setWaitState();
|
setWaitState();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -325,7 +338,13 @@ public class IndelRecordPileCollector implements RecordReceiver {
|
||||||
System.out.print(finalPile.size() + " reads in the pile;") ;
|
System.out.print(finalPile.size() + " reads in the pile;") ;
|
||||||
System.out.println(formatRange(finalTrain));
|
System.out.println(formatRange(finalTrain));
|
||||||
indelPileReceiver.receive(finalPile);
|
indelPileReceiver.receive(finalPile);
|
||||||
} else for ( SAMRecord r : finalPile ) defaultReceiver.receive(r);
|
if ( controlWriter != null ) for ( SAMRecord r : finalPile ) controlWriter.addAlignment(r);
|
||||||
|
} else {
|
||||||
|
for ( SAMRecord r : finalPile ) {
|
||||||
|
defaultReceiver.receive(r);
|
||||||
|
controlWriter.addAlignment(r);
|
||||||
|
}
|
||||||
|
}
|
||||||
finalPile.clear();
|
finalPile.clear();
|
||||||
finalTrain.clear();
|
finalTrain.clear();
|
||||||
curr_stop = -1;
|
curr_stop = -1;
|
||||||
|
|
|
||||||
|
|
@ -240,7 +240,7 @@ public class MultipleAlignment implements Iterable<Integer> {
|
||||||
/** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately
|
/** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately
|
||||||
* shifted and ready for printout;
|
* shifted and ready for printout;
|
||||||
*/
|
*/
|
||||||
public String toString(boolean inorder) {
|
public String toString(boolean inorder, boolean dotprint) {
|
||||||
|
|
||||||
StringBuilder b = new StringBuilder();
|
StringBuilder b = new StringBuilder();
|
||||||
java.util.Formatter frmt = new java.util.Formatter(b);
|
java.util.Formatter frmt = new java.util.Formatter(b);
|
||||||
|
|
@ -278,8 +278,16 @@ public class MultipleAlignment implements Iterable<Integer> {
|
||||||
for ( int i = 0 ; i < seqs.size() ; i++ ) {
|
for ( int i = 0 ; i < seqs.size() ; i++ ) {
|
||||||
int index = (inorder ? perm[i] : i);
|
int index = (inorder ? perm[i] : i);
|
||||||
frmt.format("%3d:", ext_ids.get(index));
|
frmt.format("%3d:", ext_ids.get(index));
|
||||||
skipN(alignment_offsets.get(index)+ first_offset,b);
|
int pos = alignment_offsets.get(index)+ first_offset; // start position on the consensus sequence
|
||||||
b.append(seqs.get(index));
|
skipN(pos,b);
|
||||||
|
String aSeq = seqs.get(index);
|
||||||
|
if ( dotprint ) {
|
||||||
|
for ( int j = 0 ; j < aSeq.length() ; j++, pos++ ) {
|
||||||
|
if ( Character.toUpperCase(aSeq.charAt(j)) ==
|
||||||
|
Character.toUpperCase(consensusString[3][pos]) ) b.append('.');
|
||||||
|
else b.append(aSeq.charAt(j));
|
||||||
|
}
|
||||||
|
} else b.append(aSeq);
|
||||||
b.append('\n');
|
b.append('\n');
|
||||||
}
|
}
|
||||||
// b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n");
|
// b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n");
|
||||||
|
|
@ -290,7 +298,7 @@ public class MultipleAlignment implements Iterable<Integer> {
|
||||||
return consensus.getSequence();
|
return consensus.getSequence();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() { return toString(true); }
|
public String toString() { return toString(true, false); }
|
||||||
|
|
||||||
public int size() { return seqs.size(); }
|
public int size() { return seqs.size(); }
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
package org.broadinstitute.sting.playground.indels;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMFileWriter;
|
||||||
|
import net.sf.samtools.SAMFileWriterFactory;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Mar 25, 2009
|
||||||
|
* Time: 8:27:09 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class PassThroughWriter implements RecordReceiver {
|
||||||
|
private SAMFileWriter writer;
|
||||||
|
|
||||||
|
public PassThroughWriter( File f, SAMFileHeader h) {
|
||||||
|
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(h, false, f);
|
||||||
|
}
|
||||||
|
|
||||||
|
public PassThroughWriter(String s, SAMFileHeader h) {
|
||||||
|
this(new File(s), h);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void receive(SAMRecord r) {
|
||||||
|
//To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
writer.addAlignment(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() { writer.close() ; }
|
||||||
|
}
|
||||||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.indels;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.PrimitivePair;
|
import org.broadinstitute.sting.utils.PrimitivePair;
|
||||||
import org.broadinstitute.sting.playground.utils.CountedObject;
|
import org.broadinstitute.sting.playground.utils.CountedObject;
|
||||||
import org.broadinstitute.sting.playground.utils.CountedObjectComparatorAdapter;
|
import org.broadinstitute.sting.playground.utils.CountedObjectComparatorAdapter;
|
||||||
|
|
@ -17,6 +19,30 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
private String referenceSequence;
|
private String referenceSequence;
|
||||||
private int reference_start;
|
private int reference_start;
|
||||||
|
|
||||||
|
private int processed_piles = 0;
|
||||||
|
private int improved_piles = 0;
|
||||||
|
private int unmodified_piles = 0;
|
||||||
|
private int failed_piles = 0;
|
||||||
|
private int indels_improved = 0;
|
||||||
|
private int indel_improvement_cnt = 0;
|
||||||
|
private int indels_discarded = 0;
|
||||||
|
private int indels_added = 0;
|
||||||
|
private int indels_added_cnt = 0;
|
||||||
|
private int total_mismatches_count_in_improved = 0;
|
||||||
|
private int total_mismatches_count_in_failed = 0;
|
||||||
|
private int total_improved_mismatches_count = 0;
|
||||||
|
private int total_reads_in_improved = 0;
|
||||||
|
private int total_reads_in_failed = 0;
|
||||||
|
private int total_alignments_modified = 0;
|
||||||
|
|
||||||
|
public final static int SILENT = 0;
|
||||||
|
public final static int PILESUMMARY = 1;
|
||||||
|
public final static int ALIGNMENTS = 2;
|
||||||
|
|
||||||
|
private int mVerbosityLevel = SILENT;
|
||||||
|
|
||||||
|
private SAMFileWriter samWriter;
|
||||||
|
private RecordReceiver failedPileReceiver;
|
||||||
|
|
||||||
private static class SelectedPair {
|
private static class SelectedPair {
|
||||||
private int i_;
|
private int i_;
|
||||||
|
|
@ -68,9 +94,15 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Creates a pile builder that writes through a SAM/BAM writer opened on {@code f}.
 * The reference sequence starts out unset ({@code null}, start {@code -1}).
 *
 * @param f  output file for the writer created here
 * @param h  header used for the output file
 * @param fr receiver stored as {@code failedPileReceiver}
 */
public PileBuilder(File f, SAMFileHeader h, RecordReceiver fr) {
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    // NOTE(review): the 'false' argument is presumably the factory's "presorted" flag - confirm.
    this.samWriter = factory.makeSAMOrBAMWriter(h, false, f);
    this.referenceSequence = null;
    this.reference_start = -1;
    this.failedPileReceiver = fr;
}

/**
 * Convenience constructor taking the output location as a path string.
 *
 * @param s  path of the output file
 * @param h  header used for the output file
 * @param fr receiver stored as {@code failedPileReceiver}
 */
public PileBuilder(String s, SAMFileHeader h, RecordReceiver fr) {
    this(new File(s), h, fr);
}
|
||||||
|
|
||||||
public void setReferenceSequence(String seq, int start) {
|
public void setReferenceSequence(String seq, int start) {
|
||||||
|
|
@ -84,6 +116,10 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void receive(Collection<SAMRecord> c) {
|
public void receive(Collection<SAMRecord> c) {
|
||||||
|
|
||||||
|
//TODO: if read starts/ends with an indel (insertion, actually), we detect this as a "different" indel introduced during cleanup.
|
||||||
|
processed_piles++;
|
||||||
|
|
||||||
IndexedSequence[] seqs = new IndexedSequence[c.size()];
|
IndexedSequence[] seqs = new IndexedSequence[c.size()];
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref
|
int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref
|
||||||
|
|
@ -101,18 +137,24 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
TreeSet< CountedObject<Indel> > all_indels = new TreeSet< CountedObject<Indel> >(
|
TreeSet< CountedObject<Indel> > all_indels = new TreeSet< CountedObject<Indel> >(
|
||||||
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
|
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
|
||||||
|
|
||||||
SequencePile originalAligns = new SequencePile(pileRef);
|
SequencePile originalAligns = null;
|
||||||
|
if ( mVerbosityLevel >= ALIGNMENTS ) originalAligns = new SequencePile(pileRef);
|
||||||
|
|
||||||
for ( SAMRecord r : c ) {
|
for ( SAMRecord r : c ) {
|
||||||
originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
|
if ( mVerbosityLevel >= ALIGNMENTS ) {
|
||||||
r.getCigar(), r.getAlignmentStart() - startOnRef );
|
originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
|
||||||
|
r.getCigar(), r.getAlignmentStart() - startOnRef );
|
||||||
|
}
|
||||||
totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence);
|
totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence);
|
||||||
AlignmentUtils.collectAndCountIndels(r,all_indels);
|
AlignmentUtils.collectAndCountIndels(r,all_indels);
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.println("\n#############################################################################");
|
if ( mVerbosityLevel >= ALIGNMENTS ) {
|
||||||
System.out.println("ORIGINAL ALIGNMENT: \n");
|
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
|
||||||
originalAligns.dotprint(true);
|
System.out.println("ORIGINAL ALIGNMENT: \n");
|
||||||
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
|
originalAligns.dotprint(true);
|
||||||
|
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
|
||||||
|
}
|
||||||
|
|
||||||
List<MultipleAlignment> piles = doMultipleAlignment2(seqs);
|
List<MultipleAlignment> piles = doMultipleAlignment2(seqs);
|
||||||
|
|
||||||
|
|
@ -120,91 +162,244 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
|
// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
|
||||||
// System.out.println("; diameter of PILE2: "+ diameter(alignments2));
|
// System.out.println("; diameter of PILE2: "+ diameter(alignments2));
|
||||||
|
|
||||||
SymmetricMatrix d = new SymmetricMatrix(piles.size());
|
SymmetricMatrix d = new SymmetricMatrix(piles.size());
|
||||||
for ( int n = 0 ; n < piles.size() ; n++ ) {
|
for ( int n = 0 ; n < piles.size() ; n++ ) {
|
||||||
d.set(n,n,diameter(piles.get(n)));
|
d.set(n,n,diameter(piles.get(n)));
|
||||||
for ( int m = n+1 ; m < piles.size() ; m++ ) {
|
for ( int m = n+1 ; m < piles.size() ; m++ ) {
|
||||||
d.set(n,m,distance(piles.get(n), piles.get(m)));
|
d.set(n,m,distance(piles.get(n), piles.get(m)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
int new_mismatches = 0 ; // number of mismatches after re-alignment:
|
int new_mismatches = 0 ; // number of mismatches after re-alignment:
|
||||||
TreeSet< CountedObject<Indel> > new_indels = new TreeSet< CountedObject<Indel> >(
|
TreeSet< CountedObject<Indel> > new_indels = new TreeSet< CountedObject<Indel> >(
|
||||||
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator())
|
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator())
|
||||||
); // new indels after realignment
|
); // new indels after realignment
|
||||||
int shifted_reads = 0;
|
int shifted_reads = 0;
|
||||||
int smashed_reads = 0;
|
int smashed_reads = 0;
|
||||||
|
|
||||||
List<SAMRecord> as_list = (List<SAMRecord>)c; // ugly hack; need this to access records by ids
|
List<SAMRecord> as_list = (List<SAMRecord>)c; // ugly hack; need this to access records by ids
|
||||||
|
|
||||||
System.out.println(d.format("%8.4g"));
|
if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(d.format("%8.4g"));
|
||||||
for ( int n = 0 ; n < piles.size() ; n++ ) {
|
|
||||||
|
for ( int n = 0 ; n < piles.size() ; n++ ) {
|
||||||
// SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0);
|
// SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0);
|
||||||
|
SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5);
|
||||||
|
|
||||||
|
if ( mVerbosityLevel >= ALIGNMENTS ) {
|
||||||
|
|
||||||
|
System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):");
|
||||||
|
System.out.println(consToRef.toString());
|
||||||
|
System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString(true,true));
|
||||||
|
}
|
||||||
|
// SequencePile pileAligns = new SequencePile(pileRef);
|
||||||
|
|
||||||
|
MultipleAlignment ma = piles.get(n);
|
||||||
|
for ( Integer id : ma ) {
|
||||||
|
SAMRecord r = as_list.get(id);
|
||||||
|
int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq
|
||||||
|
|
||||||
|
/*
|
||||||
|
System.out.println("id=" + id +": offset on consensus="+cons_offset+
|
||||||
|
"; consensus wrt ref chunk="+consToRef.getAlignmentStart2wrt1()+"; chunk start="+startOnRef);
|
||||||
|
*/
|
||||||
|
|
||||||
|
int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar());
|
||||||
|
if ( ref_offset != r.getAlignmentStart()) shifted_reads++;
|
||||||
|
Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar());
|
||||||
|
/*
|
||||||
|
if ( id == 9 ) {
|
||||||
|
System.out.println("ref_offset="+ref_offset+"; orig_ref_off="+r.getAlignmentStart()+"; "+
|
||||||
|
AlignmentUtils.toString(cig));
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("adding "+id+" at "+ (ref_offset - refStarttemp));
|
||||||
|
pileAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), cig, ref_offset - refStarttemp);
|
||||||
|
*/
|
||||||
|
if ( cig.numCigarElements() != r.getCigar().numCigarElements() ) smashed_reads++;
|
||||||
|
|
||||||
|
if ( ref_offset != r.getAlignmentStart() || cig.numCigarElements() != r.getCigar().numCigarElements() ) total_alignments_modified++;
|
||||||
|
|
||||||
|
SAMRecord rtest = new SAMRecord(r.getHeader());
|
||||||
|
rtest.setAlignmentStart(ref_offset);
|
||||||
|
rtest.setReadString(r.getReadString());
|
||||||
|
rtest.setReadUmappedFlag(r.getReadUnmappedFlag());
|
||||||
|
rtest.setCigar(cig);
|
||||||
|
AlignmentUtils.collectAndCountIndels(rtest,new_indels);
|
||||||
|
new_mismatches += AlignmentUtils.numMismatches(rtest,referenceSequence);
|
||||||
|
}
|
||||||
|
// pileAligns.colorprint(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean pile_improved = false;
|
||||||
|
boolean pile_unmodified = false;
|
||||||
|
boolean pile_failed = false;
|
||||||
|
|
||||||
|
double mmChangePct = Math.abs((new_mismatches - totalMismatches)*100.0/totalMismatches);
|
||||||
|
|
||||||
|
if ( shifted_reads == 0 && smashed_reads == 0 ) pile_unmodified = true;
|
||||||
|
else {
|
||||||
|
if ( new_mismatches < totalMismatches ||
|
||||||
|
mmChangePct < 10.0 && ( new_indels.size() < all_indels.size() )
|
||||||
|
) pile_improved = true;
|
||||||
|
else pile_failed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( pile_improved ) {
|
||||||
|
total_mismatches_count_in_improved +=totalMismatches;
|
||||||
|
total_improved_mismatches_count += new_mismatches;
|
||||||
|
total_reads_in_improved += c.size() ;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( pile_failed ) {
|
||||||
|
total_mismatches_count_in_failed += totalMismatches;
|
||||||
|
total_reads_in_failed += c.size();
|
||||||
|
}
|
||||||
|
int discovered_indels = 0;
|
||||||
|
int discovered_support = 0;
|
||||||
|
int existing_indels = 0;
|
||||||
|
int existing_support = 0;
|
||||||
|
int existing_support_new = 0;
|
||||||
|
int discarded_indels = 0;
|
||||||
|
for ( CountedObject<Indel> ind : new_indels ) {
|
||||||
|
//System.out.print("new indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop());
|
||||||
|
if ( ! all_indels.contains(ind) ) {
|
||||||
|
//System.out.println(" (DISCOVERED)");
|
||||||
|
discovered_indels++;
|
||||||
|
discovered_support += ind.getCount();
|
||||||
|
if ( pile_improved ) {
|
||||||
|
indels_added++;
|
||||||
|
indels_added_cnt += ind.getCount();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//System.out.println(" (EXISTING)");
|
||||||
|
existing_indels++;
|
||||||
|
existing_support_new += ind.getCount();
|
||||||
|
if ( pile_improved && ( ind.getCount() > all_indels.floor(ind).getCount() ) ) {
|
||||||
|
if ( ! ind.equals(all_indels.floor(ind))) System.out.println("ERROR MATCHING INDELS!!!") ;
|
||||||
|
indels_improved++;
|
||||||
|
indel_improvement_cnt += ( ind.getCount() - all_indels.floor(ind).getCount() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for ( CountedObject<Indel> ind : all_indels ) {
|
||||||
|
//System.out.print("old indel: "+ind.getObject().getStart()+"+"+ind.getObject().getStop());
|
||||||
|
if ( ! new_indels.contains(ind )) {
|
||||||
|
//System.out.println(" (DISCARDED)");
|
||||||
|
discarded_indels++;
|
||||||
|
if ( pile_improved ) indels_discarded++;
|
||||||
|
} else {
|
||||||
|
//System.out.println(" (KEPT)");
|
||||||
|
existing_support += ind.getCount();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( pile_improved ) improved_piles++;
|
||||||
|
if ( pile_unmodified ) unmodified_piles++;
|
||||||
|
if ( pile_failed ) failed_piles++;
|
||||||
|
|
||||||
|
if ( mVerbosityLevel >= PILESUMMARY ) {
|
||||||
|
System.out.print("TOTAL MISMATCHES: "+totalMismatches +" --> "+new_mismatches);
|
||||||
|
if ( totalMismatches > new_mismatches ) System.out.print("(-");
|
||||||
|
else System.out.print("(+");
|
||||||
|
System.out.printf("%.2f%%)%n",mmChangePct);
|
||||||
|
|
||||||
|
System.out.println("CONFIRMED INDELS: "+existing_indels);
|
||||||
|
System.out.print("CONFIRMED INDEL SUPPORT: "+existing_support + " --> " + existing_support_new );
|
||||||
|
if ( existing_support > existing_support_new ) System.out.print("(-");
|
||||||
|
else System.out.print("(+");
|
||||||
|
System.out.printf("%.2f%%)%n",Math.abs((existing_support- existing_support_new)*100.0/existing_support));
|
||||||
|
System.out.println("DROPPED INDELS: " + discarded_indels);
|
||||||
|
System.out.println("DISCOVERED INDELS: " + discovered_indels) ;
|
||||||
|
System.out.println("DISCOVERED INDELS SUPPORT: "+discovered_support);
|
||||||
|
System.out.println("ALIGNMENTS SHIFTED: "+shifted_reads);
|
||||||
|
System.out.println("ALIGNMENTS WITH GAPS CHANGED: "+smashed_reads);
|
||||||
|
|
||||||
|
if ( pile_improved ) System.out.println("OUTCOME: IMPROVED");
|
||||||
|
if ( pile_unmodified ) System.out.println("OUTCOME: UNCHANGED");
|
||||||
|
if ( pile_failed ) System.out.println("OUTCOME: FAILED");
|
||||||
|
|
||||||
|
System.out.println("\n#############################################################################\n");
|
||||||
|
}
|
||||||
|
// finally, writing stuff:
|
||||||
|
for ( int n = 0 ; n < piles.size() ; n++ ) {
|
||||||
|
|
||||||
SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5);
|
SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4.0,-0.5);
|
||||||
|
|
||||||
System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):");
|
|
||||||
System.out.println(consToRef.toString());
|
|
||||||
System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString());
|
|
||||||
|
|
||||||
MultipleAlignment ma = piles.get(n);
|
MultipleAlignment ma = piles.get(n);
|
||||||
for ( Integer id : ma ) {
|
|
||||||
|
Iterator<Integer> id_iter = ma.sequenceIdByOffsetIterator();
|
||||||
|
while ( id_iter.hasNext() ) {
|
||||||
|
|
||||||
|
int id = id_iter.next();
|
||||||
|
|
||||||
SAMRecord r = as_list.get(id);
|
SAMRecord r = as_list.get(id);
|
||||||
|
if ( pile_failed || pile_unmodified ) {
|
||||||
|
failedPileReceiver.receive(r); // nothing to do, send failed piles directly for writing
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// we improved stuff!! let's reset the alignment parameters!
|
||||||
|
|
||||||
int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq
|
int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq
|
||||||
int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1();
|
|
||||||
if ( ref_offset != r.getAlignmentStart()) shifted_reads++;
|
// offset of the realigned read r on the reference
|
||||||
|
int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()+indelCorrection(cons_offset,consToRef.getCigar());
|
||||||
|
|
||||||
|
r.setAlignmentStart(ref_offset);
|
||||||
|
|
||||||
Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar());
|
Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar());
|
||||||
if ( cig.numCigarElements() != r.getCigar().numCigarElements() ) smashed_reads++;
|
|
||||||
SAMRecord rtest = new SAMRecord(r.getHeader());
|
|
||||||
rtest.setAlignmentStart(ref_offset);
|
|
||||||
rtest.setReadString(r.getReadString());
|
|
||||||
rtest.setReadUmappedFlag(r.getReadUnmappedFlag());
|
|
||||||
rtest.setCigar(cig);
|
|
||||||
AlignmentUtils.collectAndCountIndels(rtest,new_indels);
|
|
||||||
new_mismatches += AlignmentUtils.numMismatches(rtest,referenceSequence);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
r.setCigar(cig);
|
||||||
|
|
||||||
|
r.setAttribute("NM",new Integer(AlignmentUtils.numMismatches(r,referenceSequence)));
|
||||||
|
|
||||||
|
if ( r.getAlignmentStart() == 713655 ) {
|
||||||
|
System.out.println("!!!----> "+r.format());
|
||||||
|
System.out.println("!!!----> "+AlignmentUtils.toString(cig) +" --- " +AlignmentUtils.toString(r.getCigar()));
|
||||||
|
}
|
||||||
|
// System.out.println("writing " + id);
|
||||||
|
samWriter.addAlignment(r);
|
||||||
|
|
||||||
int discovered_indels = 0;
|
|
||||||
int discovered_support = 0;
|
|
||||||
int existing_indels = 0;
|
|
||||||
int existing_support = 0;
|
|
||||||
int existing_support_new = 0;
|
|
||||||
int discarded_indels = 0;
|
|
||||||
for ( CountedObject<Indel> ind : new_indels ) {
|
|
||||||
if ( ! all_indels.contains(ind) ) {
|
|
||||||
discovered_indels++;
|
|
||||||
discovered_support += ind.getCount();
|
|
||||||
} else {
|
|
||||||
existing_indels++;
|
|
||||||
existing_support_new += ind.getCount();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for ( CountedObject<Indel> ind : all_indels ) {
|
|
||||||
if ( ! new_indels.contains(ind )) {
|
|
||||||
discarded_indels++;
|
|
||||||
} else {
|
|
||||||
existing_support += ind.getCount();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.print("TOTAL MISMATCHES: "+totalMismatches +" --> "+new_mismatches);
|
|
||||||
if ( totalMismatches > new_mismatches ) System.out.print("(-");
|
|
||||||
else System.out.print("(+");
|
|
||||||
System.out.println(Math.abs((new_mismatches - totalMismatches)*100.0/totalMismatches)+"%)");
|
|
||||||
|
|
||||||
System.out.println("CONFIRMED INDELS: "+existing_indels);
|
|
||||||
System.out.print("CONFIRMED INDEL SUPPORT: "+existing_support + " --> " + existing_support_new );
|
|
||||||
if ( existing_support > existing_support_new ) System.out.print("(-");
|
|
||||||
else System.out.print("(+");
|
|
||||||
System.out.println(Math.abs((existing_support- existing_support_new)*100.0/existing_support)+"%)");
|
|
||||||
System.out.println("DROPPED INDELS: " + discarded_indels);
|
|
||||||
System.out.println("DISCOVERED INDELS: " + discovered_indels) ;
|
|
||||||
System.out.println("DISCOVERED INDELS SUPPORT: "+discovered_support);
|
|
||||||
System.out.println("ALIGNMENTS SHIFTED: "+shifted_reads);
|
|
||||||
System.out.println("ALIGNMENTS WITH GAPS CHANGED: "+smashed_reads);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Closes the writer held in {@code samWriter}. */
// NOTE(review): only samWriter is closed here; this method does not touch failedPileReceiver -- confirm it is closed elsewhere.
public void close() { samWriter.close(); }
|
||||||
|
|
||||||
|
/**
 * Expresses {@code i} as a percentage of {@code t}, i.e. {@code i*100/t} computed in
 * double precision.
 *
 * @param i numerator (count of interest); may be negative, giving a negative percentage
 * @param t denominator (total)
 * @return {@code 100.0 * i / t}; because the division is floating-point, {@code t == 0}
 *         does not throw but yields NaN (when {@code i == 0}) or +/-Infinity
 */
public double pct (int i, int t) {
|
||||||
|
return ((double)i*100.0/((double)t));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Prints the accumulated run statistics to standard output: the number of piles
 * processed and how many were improved / left unchanged / failed, followed by
 * per-read and per-pile averages for the improved piles and for the failed piles.
 *
 * All values are read from counter fields of this object; nothing is modified.
 * Every ratio is computed with double division, so a zero denominator (for
 * example no improved piles at all) shows up in the output as NaN or Infinity
 * instead of raising an exception.
 */
public void printStats() {
|
||||||
|
System.out.println("\n---------------------------------------------------------------------------------");
|
||||||
|
System.out.println("Piles processed: "+ processed_piles);
|
||||||
|
System.out.printf("Piles improved: %d (%.2f%%)%n", improved_piles,pct(improved_piles,processed_piles));
|
||||||
|
System.out.printf("Piles confirmed (unchanged): %d (%.2f%%)%n", unmodified_piles,pct(unmodified_piles,processed_piles));
|
||||||
|
System.out.printf("Piles failed: %d (%.2f%%)%n", failed_piles,pct(failed_piles,processed_piles));
|
||||||
|
System.out.println("In improved piles:");
|
||||||
|
// reads per improved pile, and original mismatches per read in those piles
System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_improved,
|
||||||
|
(double)total_reads_in_improved/(double)improved_piles,(double) total_mismatches_count_in_improved /(double)total_reads_in_improved);
|
||||||
|
// the percentage is the signed change (new - original) relative to the original count,
// so it is negative when the mismatch count went down
System.out.printf(" Overall mismatch count: %d --> %d (%.2f%%)%n", total_mismatches_count_in_improved,total_improved_mismatches_count,
|
||||||
|
pct(total_improved_mismatches_count- total_mismatches_count_in_improved, total_mismatches_count_in_improved));
|
||||||
|
// here the sign is flipped (original - new): mismatches removed per read
System.out.printf(" Mismatch improvement: suppressed %.2f mm/read%n",
|
||||||
|
(double)(total_mismatches_count_in_improved -total_improved_mismatches_count)/(double)total_reads_in_improved );
|
||||||
|
System.out.printf(" Alignments modified: %d (%.2f%% of total or %.2f per pile)%n",total_alignments_modified,
|
||||||
|
pct(total_alignments_modified,total_reads_in_improved),(double)total_alignments_modified/(double)improved_piles);
|
||||||
|
// indel_improvement_cnt / indels_improved: average extra supporting reads per improved indel
System.out.printf(" Improved indels: %d (%.2f per pile) with %.3f additional reads per indel%n",
|
||||||
|
indels_improved,(double)indels_improved/(double)improved_piles,(double)indel_improvement_cnt/(double)indels_improved);
|
||||||
|
System.out.printf(" New indels: %d (%.2f per pile) with %.3f reads per indel%n",
|
||||||
|
indels_added,(double)indels_added/(double)improved_piles,(double)indels_added_cnt/(double)indels_added);
|
||||||
|
System.out.printf(" Discarded indels: %d (%.2f per pile)%n",
|
||||||
|
indels_discarded,(double)indels_discarded/(double)improved_piles);
|
||||||
|
System.out.println("In failed piles:");
|
||||||
|
System.out.printf(" Total reads: %d (%.1f per pile) with %.2f mm/read originally%n", total_reads_in_failed,
|
||||||
|
(double)total_reads_in_failed/(double)failed_piles,(double) total_mismatches_count_in_failed /(double)total_reads_in_failed);
|
||||||
|
System.out.println("---------------------------------------------------------------------------------\n");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Sets the verbosity level stored in {@code mVerbosityLevel}. Diagnostic printouts
 * in this class are guarded by comparisons of that field against level constants
 * such as {@code PILESUMMARY}.
 *
 * @param v new verbosity level
 */
public void setVerbosity(int v) {
|
||||||
|
mVerbosityLevel = v;
|
||||||
|
}
|
||||||
/** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X,
|
/** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X,
|
||||||
* and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct
|
* and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct
|
||||||
* alignment of the read to Y (i.e. if the alignment of X to Y contains indel(s) and the read spans them, the
|
* alignment of the read to Y (i.e. if the alignment of X to Y contains indel(s) and the read spans them, the
|
||||||
|
|
@ -224,7 +419,7 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while ( refpos <= s ) {
|
while ( refpos <= s ) {
|
||||||
celem = baseCigar.getCigarElement(i);
|
celem = baseCigar.getCigarElement(i);
|
||||||
refpos+=celem.getLength();
|
if ( celem.getOperator() != CigarOperator.D ) refpos+=celem.getLength();
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
// we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element
|
// we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element
|
||||||
|
|
@ -241,6 +436,24 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
return new Cigar(lce);
|
return new Cigar(lce);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Walks the elements of {@code cig} from the start, consuming {@code offset} bases,
 * and returns the net adjustment caused by the indels encountered on the way:
 * deletions add their length, insertions subtract theirs (at most the part of
 * the insertion that lies before {@code offset}).
 *
 * NOTE(review): presumably {@code offset} is a zero-based position on the aligned
 * (second) sequence and the result converts it to a position on the reference;
 * the visible caller adds the result to
 * {@code cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1()} -- confirm.
 *
 * @param offset number of bases to walk through; a value <= 0 returns 0 immediately
 * @param cig    cigar to walk
 * @return signed correction (sum of deletion lengths minus consumed insertion lengths)
 */
private int indelCorrection(int offset, Cigar cig) {
|
||||||
|
int correction = 0;
|
||||||
|
// stop as soon as the whole offset has been consumed or the cigar runs out
for ( int i = 0 ; i < cig.numCigarElements() && offset > 0 ; i++ ) {
|
||||||
|
CigarElement ce = cig.getCigarElement(i);
|
||||||
|
// only M, I and D are handled; any other operator falls through the switch untouched
switch ( ce.getOperator() ) {
|
||||||
|
// match block: consumes offset, no correction
case M: offset -= ce.getLength() ; break;
|
||||||
|
case I:
|
||||||
|
// insertion entirely before the offset: subtract its full length ...
if ( offset >= ce.getLength() ) correction-= ce.getLength();
|
||||||
|
// ... otherwise the offset falls inside the insertion: subtract only the remaining offset
else correction -= offset;
|
||||||
|
offset -= ce.getLength();
|
||||||
|
break;
|
||||||
|
// deletion: adds to the correction but does not consume any of the offset
case D: correction+=ce.getLength();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return correction;
|
||||||
|
}
|
||||||
|
|
||||||
public void initPairwiseAlignments( IndexedSequence [] seqs ) {
|
public void initPairwiseAlignments( IndexedSequence [] seqs ) {
|
||||||
distances = new SymmetricMatrix( seqs.length );
|
distances = new SymmetricMatrix( seqs.length );
|
||||||
alignments = new Matrix<PairwiseAlignment>( seqs.length );
|
alignments = new Matrix<PairwiseAlignment>( seqs.length );
|
||||||
|
|
@ -517,7 +730,7 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
*/
|
*/
|
||||||
public double diameter(MultipleAlignment a) {
|
public double diameter(MultipleAlignment a) {
|
||||||
double dmaxmin = 0.0;
|
double dmaxmin = 0.0;
|
||||||
System.out.print("\n[");
|
if ( mVerbosityLevel >= PILESUMMARY ) System.out.print("\nclosest neighbor for each seq: [");
|
||||||
Iterator<Integer> ids1 = a.sequenceIdByOffsetIterator();
|
Iterator<Integer> ids1 = a.sequenceIdByOffsetIterator();
|
||||||
while ( ids1.hasNext() ) {
|
while ( ids1.hasNext() ) {
|
||||||
Integer id1 = ids1.next();
|
Integer id1 = ids1.next();
|
||||||
|
|
@ -528,10 +741,10 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
d = Math.min(d,dpair);
|
d = Math.min(d,dpair);
|
||||||
}
|
}
|
||||||
// d = distance from id1 to its closest neighbor within the pile
|
// d = distance from id1 to its closest neighbor within the pile
|
||||||
if ( d < 1e99 ) System.out.printf("%8.4g",d);
|
if ( d < 1e99 && mVerbosityLevel >= PILESUMMARY ) System.out.printf("%8.4g",d);
|
||||||
if ( d < 1e99 && d > dmaxmin ) dmaxmin = d;
|
if ( d < 1e99 && d > dmaxmin ) dmaxmin = d;
|
||||||
}
|
}
|
||||||
System.out.println(" ]");
|
if ( mVerbosityLevel >= PILESUMMARY ) System.out.println(" ]");
|
||||||
// dmaxmin = the largest distance from a sequence in this pile to its closest neighbor
|
// dmaxmin = the largest distance from a sequence in this pile to its closest neighbor
|
||||||
// System.out.println();
|
// System.out.println();
|
||||||
return dmaxmin;
|
return dmaxmin;
|
||||||
|
|
@ -545,7 +758,7 @@ public class PileBuilder implements RecordPileReceiver {
|
||||||
// IndexedSequence [] seqs = testSet3(K); // initialize test set data
|
// IndexedSequence [] seqs = testSet3(K); // initialize test set data
|
||||||
IndexedSequence [] seqs = testSet4(K); // initialize test set data
|
IndexedSequence [] seqs = testSet4(K); // initialize test set data
|
||||||
|
|
||||||
PileBuilder pb = new PileBuilder();
|
PileBuilder pb = new PileBuilder("test1.bam",null,new DiscardingReceiver());
|
||||||
|
|
||||||
//pb.doMultipleAlignment(seqs);
|
//pb.doMultipleAlignment(seqs);
|
||||||
pb.doMultipleAlignment2(seqs);
|
pb.doMultipleAlignment2(seqs);
|
||||||
|
|
|
||||||
|
|
@ -91,6 +91,8 @@ public class SWPairwiseAlignment {
|
||||||
|
|
||||||
PrimitivePair.Int p = new PrimitivePair.Int();
|
PrimitivePair.Int p = new PrimitivePair.Int();
|
||||||
int maxscore = 0;
|
int maxscore = 0;
|
||||||
|
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
||||||
|
|
||||||
// look for largest score. we use >= combined with the traversal direction
|
// look for largest score. we use >= combined with the traversal direction
|
||||||
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
||||||
for ( int i = 1 ; i < n+1 ; i++ ) {
|
for ( int i = 1 ; i < n+1 ; i++ ) {
|
||||||
|
|
@ -102,6 +104,7 @@ public class SWPairwiseAlignment {
|
||||||
p.first = n;
|
p.first = n;
|
||||||
p.second = j ;
|
p.second = j ;
|
||||||
maxscore = sw[n][j];
|
maxscore = sw[n][j];
|
||||||
|
segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -114,7 +117,6 @@ public class SWPairwiseAlignment {
|
||||||
// to that sequence
|
// to that sequence
|
||||||
|
|
||||||
int state = MSTATE;
|
int state = MSTATE;
|
||||||
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
|
||||||
|
|
||||||
int [] scores = new int[3];
|
int [] scores = new int[3];
|
||||||
|
|
||||||
|
|
@ -160,12 +162,12 @@ public class SWPairwiseAlignment {
|
||||||
case DSTATE: o = CigarOperator.D; break;
|
case DSTATE: o = CigarOperator.D; break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
alignment_offset = p.first - p.second;
|
||||||
segment_length+=p.second;
|
segment_length+=p.second;
|
||||||
CigarElement e = new CigarElement(segment_length,o);
|
CigarElement e = new CigarElement(segment_length,o);
|
||||||
lce.add(e);
|
lce.add(e);
|
||||||
Collections.reverse(lce);
|
Collections.reverse(lce);
|
||||||
alignmentCigar = new Cigar(lce);
|
alignmentCigar = new Cigar(lce);
|
||||||
alignment_offset = p.first - p.second;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Allows for separate gap opening and extension penalties, no direct backtracking.
|
/** Allows for separate gap opening and extension penalties, no direct backtracking.
|
||||||
|
|
@ -198,6 +200,8 @@ public class SWPairwiseAlignment {
|
||||||
|
|
||||||
PrimitivePair.Int p = new PrimitivePair.Int();
|
PrimitivePair.Int p = new PrimitivePair.Int();
|
||||||
double maxscore = 0.0;
|
double maxscore = 0.0;
|
||||||
|
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
||||||
|
|
||||||
// look for largest score. we use >= combined with the traversal direction
|
// look for largest score. we use >= combined with the traversal direction
|
||||||
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
||||||
for ( int i = 1 ; i < n+1 ; i++ ) {
|
for ( int i = 1 ; i < n+1 ; i++ ) {
|
||||||
|
|
@ -209,6 +213,7 @@ public class SWPairwiseAlignment {
|
||||||
p.first = n;
|
p.first = n;
|
||||||
p.second = j ;
|
p.second = j ;
|
||||||
maxscore = sw[n][j];
|
maxscore = sw[n][j];
|
||||||
|
segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -221,7 +226,6 @@ public class SWPairwiseAlignment {
|
||||||
// to that sequence
|
// to that sequence
|
||||||
|
|
||||||
int state = MSTATE;
|
int state = MSTATE;
|
||||||
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
|
||||||
|
|
||||||
double [] scores = new double[3];
|
double [] scores = new double[3];
|
||||||
|
|
||||||
|
|
@ -268,12 +272,12 @@ public class SWPairwiseAlignment {
|
||||||
case ISTATE: o = CigarOperator.I; break;
|
case ISTATE: o = CigarOperator.I; break;
|
||||||
case DSTATE: o = CigarOperator.D; break;
|
case DSTATE: o = CigarOperator.D; break;
|
||||||
}
|
}
|
||||||
|
alignment_offset = p.first - p.second;
|
||||||
segment_length+=p.second;
|
segment_length+=p.second;
|
||||||
CigarElement e = new CigarElement(segment_length,o);
|
CigarElement e = new CigarElement(segment_length,o);
|
||||||
lce.add(e);
|
lce.add(e);
|
||||||
Collections.reverse(lce);
|
Collections.reverse(lce);
|
||||||
alignmentCigar = new Cigar(lce);
|
alignmentCigar = new Cigar(lce);
|
||||||
alignment_offset = p.first - p.second ;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -342,6 +346,8 @@ public void align3(String a, String b) {
|
||||||
|
|
||||||
PrimitivePair.Int p = new PrimitivePair.Int();
|
PrimitivePair.Int p = new PrimitivePair.Int();
|
||||||
double maxscore = 0.0;
|
double maxscore = 0.0;
|
||||||
|
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
||||||
|
|
||||||
// look for largest score. we use >= combined with the traversal direction
|
// look for largest score. we use >= combined with the traversal direction
|
||||||
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
||||||
for ( int i = 1 ; i < n+1 ; i++ ) {
|
for ( int i = 1 ; i < n+1 ; i++ ) {
|
||||||
|
|
@ -353,6 +359,7 @@ public void align3(String a, String b) {
|
||||||
p.first = n;
|
p.first = n;
|
||||||
p.second = j ;
|
p.second = j ;
|
||||||
maxscore = sw[n][j];
|
maxscore = sw[n][j];
|
||||||
|
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -365,7 +372,6 @@ public void align3(String a, String b) {
|
||||||
// to that sequence
|
// to that sequence
|
||||||
|
|
||||||
int state = MSTATE;
|
int state = MSTATE;
|
||||||
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
|
||||||
|
|
||||||
double [] scores = new double[3];
|
double [] scores = new double[3];
|
||||||
|
|
||||||
|
|
@ -419,12 +425,12 @@ public void align3(String a, String b) {
|
||||||
case ISTATE: o = CigarOperator.I; break;
|
case ISTATE: o = CigarOperator.I; break;
|
||||||
case DSTATE: o = CigarOperator.D; break;
|
case DSTATE: o = CigarOperator.D; break;
|
||||||
}
|
}
|
||||||
|
alignment_offset = p.first - p.second;
|
||||||
segment_length+=p.second;
|
segment_length+=p.second;
|
||||||
CigarElement e = new CigarElement(segment_length,o);
|
CigarElement e = new CigarElement(segment_length,o);
|
||||||
lce.add(e);
|
lce.add(e);
|
||||||
Collections.reverse(lce);
|
Collections.reverse(lce);
|
||||||
alignmentCigar = new Cigar(lce);
|
alignmentCigar = new Cigar(lce);
|
||||||
alignment_offset = p.first - p.second;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void align4(String a, String b) {
|
public void align4(String a, String b) {
|
||||||
|
|
@ -485,6 +491,8 @@ public void align3(String a, String b) {
|
||||||
|
|
||||||
PrimitivePair.Int p = new PrimitivePair.Int();
|
PrimitivePair.Int p = new PrimitivePair.Int();
|
||||||
double maxscore = 0.0;
|
double maxscore = 0.0;
|
||||||
|
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
||||||
|
|
||||||
// look for largest score. we use >= combined with the traversal direction
|
// look for largest score. we use >= combined with the traversal direction
|
||||||
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
||||||
for ( int i = 1 ; i < n+1 ; i++ ) {
|
for ( int i = 1 ; i < n+1 ; i++ ) {
|
||||||
|
|
@ -496,6 +504,7 @@ public void align3(String a, String b) {
|
||||||
p.first = n;
|
p.first = n;
|
||||||
p.second = j ;
|
p.second = j ;
|
||||||
maxscore = sw[n][j];
|
maxscore = sw[n][j];
|
||||||
|
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -508,7 +517,6 @@ public void align3(String a, String b) {
|
||||||
// to that sequence
|
// to that sequence
|
||||||
|
|
||||||
int state = MSTATE;
|
int state = MSTATE;
|
||||||
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
|
|
||||||
|
|
||||||
double [] scores = new double[3];
|
double [] scores = new double[3];
|
||||||
|
|
||||||
|
|
@ -558,12 +566,12 @@ public void align3(String a, String b) {
|
||||||
case ISTATE: o = CigarOperator.I; break;
|
case ISTATE: o = CigarOperator.I; break;
|
||||||
case DSTATE: o = CigarOperator.D; break;
|
case DSTATE: o = CigarOperator.D; break;
|
||||||
}
|
}
|
||||||
|
alignment_offset = p.first - p.second;
|
||||||
segment_length+=p.second;
|
segment_length+=p.second;
|
||||||
CigarElement e = new CigarElement(segment_length,o);
|
CigarElement e = new CigarElement(segment_length,o);
|
||||||
lce.add(e);
|
lce.add(e);
|
||||||
Collections.reverse(lce);
|
Collections.reverse(lce);
|
||||||
alignmentCigar = new Cigar(lce);
|
alignmentCigar = new Cigar(lce);
|
||||||
alignment_offset = p.first - p.second;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private int w(char x, char y) {
|
private int w(char x, char y) {
|
||||||
|
|
@ -688,17 +696,21 @@ public void align3(String a, String b) {
|
||||||
}
|
}
|
||||||
// now pos1 = alignment_offset
|
// now pos1 = alignment_offset
|
||||||
}
|
}
|
||||||
System.out.println(AlignmentUtils.toString(getCigar()));
|
/* debug prints: */
|
||||||
System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length());
|
// System.out.println(AlignmentUtils.toString(getCigar()));
|
||||||
System.out.println("offset="+alignment_offset);
|
// System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length());
|
||||||
|
// System.out.println("offset="+alignment_offset);
|
||||||
try {
|
// System.out.println("pos1="+pos1+"; pos2=" + pos2);
|
||||||
System.out.println("pos1="+pos1+"; pos2=" + pos2);
|
/**/
|
||||||
for ( int i = 0 ; i < getCigar().numCigarElements() ; i++ ) {
|
for ( int i = 0 ; i < getCigar().numCigarElements() ; i++ ) {
|
||||||
CigarElement ce = getCigar().getCigarElement(i) ;
|
CigarElement ce = getCigar().getCigarElement(i) ;
|
||||||
switch( ce.getOperator() ) {
|
switch( ce.getOperator() ) {
|
||||||
case M:
|
case M:
|
||||||
for ( int k = 0 ; k < ce.getLength() ; k++ ) {
|
int z = ( i == 0 ? pos2 : 0); // if we are in the first element and seq overhangs to the left,
|
||||||
|
// start inside the first segment, at the position where actual matches begin
|
||||||
|
// check separately for pos1 < s1.length() since seq2 is allowed to overhang beyond seq1's end
|
||||||
|
for ( ; z < ce.getLength() && pos1 < s1.length() ; z++ ) {
|
||||||
|
// System.out.println("pos1="+pos1+"; pos2="+pos2+"; k="+z);
|
||||||
if ( Character.toUpperCase(s1.charAt(pos1)) !=
|
if ( Character.toUpperCase(s1.charAt(pos1)) !=
|
||||||
Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*');
|
Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*');
|
||||||
else bmm.append(' ');
|
else bmm.append(' ');
|
||||||
|
|
@ -722,10 +734,7 @@ public void align3(String a, String b) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch(Exception e) {}
|
|
||||||
b1.append("<---");
|
|
||||||
b2.append("<---");
|
|
||||||
bmm.append("<---");
|
|
||||||
bmm.append('\n');
|
bmm.append('\n');
|
||||||
b1.append(s1,pos1,s1.length());
|
b1.append(s1,pos1,s1.length());
|
||||||
bmm.append(b1);
|
bmm.append(b1);
|
||||||
|
|
@ -733,6 +742,7 @@ public void align3(String a, String b) {
|
||||||
b2.append(s2,pos2,s2.length());
|
b2.append(s2,pos2,s2.length());
|
||||||
bmm.append(b2);
|
bmm.append(b2);
|
||||||
bmm.append('\n');
|
bmm.append('\n');
|
||||||
|
|
||||||
return bmm.toString();
|
return bmm.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -231,7 +231,7 @@ public class SequencePile {
|
||||||
if ( refC == '+' ) {
|
if ( refC == '+' ) {
|
||||||
// count number of observations for insertion
|
// count number of observations for insertion
|
||||||
for ( int j = 0 ; j < col.size() ; j++ ) {
|
for ( int j = 0 ; j < col.size() ; j++ ) {
|
||||||
if ( col.charAt(j) != '*' ) count++;
|
if ( col.charAt(j) != '*' && col.charAt(j) != ' ') count++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if ( headerGrid.charAt(i) == '-' ) {
|
if ( headerGrid.charAt(i) == '-' ) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue