diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java index 7abe7cd24..fdf50c55e 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java @@ -36,6 +36,7 @@ public class IndelInspector extends CommandLineProgram { protected int doWork() { + System.out.println("I am at version 0.1"); GenomeLoc location = null; if ( GENOME_LOCATION != null ) { location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION); @@ -59,7 +60,7 @@ public class IndelInspector extends CommandLineProgram { IndelRecordPileCollector col = null; try { - col = new IndelRecordPileCollector(); + col = new IndelRecordPileCollector(new DiscardingReceiver(), new PileBuilder() ); } catch(Exception e) { System.err.println(e.getMessage()); } if ( col == null ) return 1; @@ -105,8 +106,8 @@ public class IndelInspector extends CommandLineProgram { else if ( ERR_MODE.equals("ERR")) err = numErrors(r); else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r); if ( err > MAX_ERRS.intValue() ) continue; - counter++; - if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString()); +// counter++; +// if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString()); col.receive(r); } diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java index c07ab1056..78e6e19c5 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java @@ -86,16 +86,16 @@ public class IndelRecordPileCollector implements RecordReceiver { private List mIndelLengthHistI; private List mIndelLengthHistD; - private RecordReceiver nonindelReceiver; // we will send there records that do not overlap with regions of interest + private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there - public String memStatsString() { + public String memStatsString() { String s = "mRecordPile: "; return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef; //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop; } - public IndelRecordPileCollector() throws java.io.IOException { + public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException { mRecordPile = new LinkedList(); mAllIndels = new TreeSet >( new CountedObjectComparatorAdapter(new IntervalComparator())); @@ -108,8 +108,8 @@ public class IndelRecordPileCollector implements RecordReceiver { mIndelLengthHistI.add(0); mIndelLengthHistD.add(0); } - nonindelReceiver = new DiscardingReceiver(); - indelPileReceiver = new DiscardingPileReceiver(); + defaultReceiver = rr; + indelPileReceiver = rp; setWaitState(); } @@ -138,7 +138,7 @@ public class IndelRecordPileCollector implements RecordReceiver { while ( i.hasNext() ) { SAMRecord r = i.next(); if ( r.getAlignmentEnd() <= pos ) { - nonindelReceiver.receive(r); + defaultReceiver.receive(r); i.remove(); } else break; } @@ -153,7 +153,7 @@ public class IndelRecordPileCollector implements RecordReceiver { while ( i.hasNext() ) { SAMRecord r = i.next(); if ( r.getAlignmentStart() >= pos ) { - nonindelReceiver.receive(r); + defaultReceiver.receive(r); i.remove(); } else break; } @@ -243,8 +243,8 @@ public class IndelRecordPileCollector implements RecordReceiver { " bases between indels"); } - // no indels or avoiding indels in bad region: send all records to nonindelReceiver and clear the pile - for ( SAMRecord r : mRecordPile ) nonindelReceiver.receive(r); + // no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile + for ( SAMRecord r : mRecordPile ) defaultReceiver.receive(r); setWaitState(); return; } @@ -277,7 +277,7 @@ public class IndelRecordPileCollector implements RecordReceiver { while ( indel != null ) { - // first, if we just started new indel train, then emit into nonindelReceiver all alignments + // first, if we just started new indel train, then emit into defaultReceiver all alignments // that end prior to the first indel in the train: if ( finalTrain.size() == 0 ) purgeRecordsEndingAtOrBefore(indel.getObject().getStart() - 1); @@ -315,8 +315,14 @@ public class IndelRecordPileCollector implements RecordReceiver { finalTrain.size() + " indels"); System.out.println(formatRange(finalTrain)); - if ( shouldAcceptForOutput(finalTrain ) ) indelPileReceiver.receive(finalPile); - else for ( SAMRecord r : finalPile ) nonindelReceiver.receive(r); + if ( shouldAcceptForOutput(finalTrain ) ) { + System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" + + finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " + + finalTrain.size() + " indels; "); + System.out.print(finalPile.size() + " reads in the pile;") ; + System.out.println(formatRange(finalTrain)); + indelPileReceiver.receive(finalPile); + } else for ( SAMRecord r : finalPile ) defaultReceiver.receive(r); finalPile.clear(); finalTrain.clear(); curr_stop = -1; @@ -444,7 +450,8 @@ public class IndelRecordPileCollector implements RecordReceiver { long min = 1000000000; long max = 0; - + + all.append("passing indels:"); for ( CountedObject o : indels ) { if ( o.getCount() < 2 ) continue; all.append(" "); @@ -452,9 +459,9 @@ public class IndelRecordPileCollector implements RecordReceiver { if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength(); if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength(); } - b.append(" min: "); + b.append("; passing min length: "); b.append(min); - b.append(" max: "); + b.append("; passing max length: "); b.append(max); b.append(all); return b.toString(); diff --git a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java index f853e767b..16686f5bd 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java +++ b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java @@ -1,11 +1,18 @@ package org.broadinstitute.sting.indels; +import net.sf.samtools.SAMRecord; -public class PileBuilder { +import java.util.Collection; + + +public class PileBuilder implements RecordPileReceiver { private StrictlyUpperTriangularMatrix distances ; private Matrix alignments ; - - private static class SelectedPair { + private static final int KmerSize = 8; + private MultipleAlignment alignments1; + private MultipleAlignment alignments2; + + private static class SelectedPair { private int i_; private int j_; private double d_; @@ -40,7 +47,23 @@ public class PileBuilder { } public PileBuilder() {} - + + public void receive(Collection c) { + IndexedSequence[] seqs = new IndexedSequence[c.size()]; + int i = 0; + for ( SAMRecord r : c ) { + seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize); + } + doMultipleAlignment(seqs); + System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); + System.out.println("; diameter of PILE1: "+ diameter(alignments1)); + System.out.println("; diameter of PILE2: "+ diameter(alignments2)); + + System.out.println("PILE 1: \n"+alignments1.toString()); + System.out.println("PILE 2: \n"+alignments2.toString()); + + } + public void initPairwiseAlignments( IndexedSequence [] seqs ) { distances = new StrictlyUpperTriangularMatrix( seqs.length ); alignments = new Matrix( seqs.length ); @@ -57,7 +80,7 @@ public class PileBuilder { * so that the alignments are pre-computed. * @return id's of the two sequences and the distance between them in a compound object. */ - public SelectedPair findBest() { + public SelectedPair findClosestPair() { SelectedPair p = new SelectedPair(-1,-1,1e100); @@ -95,7 +118,7 @@ public class PileBuilder { * @param pexclude neither of the two sequences in the returned pair can have its id listed in pexclude pair. * @return Best pairwise alignment excluding alignments between pairs involving at least one sequence from pexclude */ - public SelectedPair findBest(SelectedPair pexclude) { + public SelectedPair findNextClosestPairAfter(SelectedPair pexclude) { SelectedPair p = new SelectedPair(-1,-1,1e100); @@ -117,7 +140,7 @@ public class PileBuilder { * @return a compound SelectedPair object that contains the index of the closest sequence found (is guaranteed to * be not in the pile), the index of the sequence in the pile it is closest to, and the actual distance between the two. */ - public SelectedPair findClosest(MultipleAlignment a) { + public SelectedPair findClosestToPile(MultipleAlignment a) { SelectedPair p = new SelectedPair(-1,-1,1e100); @@ -143,7 +166,7 @@ public class PileBuilder { * @param a alignment pile to find the closest sequence for * @return index of the farthest sequence */ - public int findFarthest(MultipleAlignment a) { + public int findFarthestFromPile(MultipleAlignment a) { double dmaxmin = 0; int i_out = -1; @@ -209,17 +232,22 @@ public class PileBuilder { IndexedSequence [] seqs = testSet3(K); // initialize test set data PileBuilder pb = new PileBuilder(); + + pb.doMultipleAlignment(seqs); + } + + public void doMultipleAlignment(IndexedSequence[] seqs) { // two piles we are going to grow until all sequences are assigned to one of them. // we intend to keep the piles disjoint, e.g. no sequence should be placed in both MultipleAlignment pile1 = new MultipleAlignment(); MultipleAlignment pile2 = new MultipleAlignment(); - pb.initPairwiseAlignments(seqs); + initPairwiseAlignments(seqs); // all the pairwise alignments are computed and disjoint best and next-best pairs are found - System.out.println( pb.distances.format("%8.4g ")); + //System.out.println( distances.format("%8.4g ")); /* SelectedPair pworst = pb.findWorst(); @@ -229,10 +257,10 @@ public class PileBuilder { */ // initialize piles with best and next-best pairs - SelectedPair p_best = pb.findBest(); - SelectedPair p_nextbest = pb.findBest(p_best); - pile1.add( pb.alignments.get(p_best.i(), p_best.j()),p_best.i(),p_best.j()); - pile2.add( pb.alignments.get(p_nextbest.i(), p_nextbest.j()),p_nextbest.i(),p_nextbest.j()); + SelectedPair p_best = findClosestPair(); + SelectedPair p_nextbest = findNextClosestPairAfter(p_best); + pile1.add( alignments.get(p_best.i(), p_best.j()),p_best.i(),p_best.j()); + pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j()),p_nextbest.i(),p_nextbest.j()); /* System.out.println("Best pair ("+p_best.i() + "," + p_best.j()+", d="+p_best.d()+"):"); System.out.println(pile1.toString()); @@ -245,42 +273,39 @@ public class PileBuilder { // grow piles hierarchical clustering-style while ( pile1.size() + pile2.size() < seqs.length ) { // find candidate sequences closest to each of the two piles - p1 = pb.findClosest(pile1); - p2 = pb.findClosest(pile2); + p1 = findClosestToPile(pile1); + p2 = findClosestToPile(pile2); int id1_cand = pile1.selectExternal(p1.i(), p1.j()); // id of the sequence closest to the pile 1 int id2_cand = pile2.selectExternal(p2.i(), p2.j()); // id of the sequence closest to the pile 2 if ( pile2.contains(id1_cand) && pile1.contains(id2_cand)) { // pile1 and pile 2 are mutually the closest, so we need to merge them. // if piles are mutually the closest, then p1 and p2 are the same pair (id1, id2), // so we just merge on one of the (redundant) instances: - pile1.add(pile2, pb.alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); + pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); pile2.clear(); // need to reset pile 2 to something else - int z = pb.findFarthest(pile1); // get sequence farthest from merged pile 1 + int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1 pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence } else { if ( p1.d() < p2.d() ) { if ( pile2.contains(id1_cand) ) { - pile1.add(pile2, pb.alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); + pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); pile2.clear(); // need to reset pile 2 to something else - int z = pb.findFarthest(pile1); // get sequence farthest from merged pile 1 + int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1 pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence - } else pile1.add( pb.alignments.get(p1.i(), p1.j()) ); + } else pile1.add( alignments.get(p1.i(), p1.j()) ); } else { if ( pile1.contains(id2_cand) ) { - pile2.add(pile1, pb.alignments.get( p2.i(), p2.j()),p2.i(), p2.j()); + pile2.add(pile1, alignments.get( p2.i(), p2.j()),p2.i(), p2.j()); pile1.clear(); // need to reset pile 2 to something else - int z = pb.findFarthest(pile2); // get sequence farthest from merged pile 1 + int z = findFarthestFromPile(pile2); // get sequence farthest from merged pile 1 pile1.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence - } else pile2.add( pb.alignments.get(p2.i(), p2.j()) ); + } else pile2.add( alignments.get(p2.i(), p2.j()) ); } } - System.out.println("PILE 1: \n"+pile1.toString()); - System.out.println("PILE 2: \n"+pile2.toString()); } // end WHILE - System.out.println("Distance between final piles: "+pb.distance(pile1, pile2)); - System.out.println("Diameter of PILE1: "+ pb.diameter(pile1)); - System.out.println("Diameter of PILE2: "+ pb.diameter(pile2)); + alignments1 = pile1; + alignments2 = pile2; /* * System.out.println("Closest distance to the pile: " + best_d + "(adding: " + best_i + "," + best_j + "):");