git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@114 348d0f76-0448-11de-a6fe-93d51630548a

This commit is contained in:
asivache 2009-03-20 15:54:29 +00:00
parent 322f4b944f
commit 9ec96414c7
3 changed files with 80 additions and 47 deletions

View File

@ -36,6 +36,7 @@ public class IndelInspector extends CommandLineProgram {
protected int doWork() { protected int doWork() {
System.out.println("I am at version 0.1");
GenomeLoc location = null; GenomeLoc location = null;
if ( GENOME_LOCATION != null ) { if ( GENOME_LOCATION != null ) {
location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION); location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);
@ -59,7 +60,7 @@ public class IndelInspector extends CommandLineProgram {
IndelRecordPileCollector col = null; IndelRecordPileCollector col = null;
try { try {
col = new IndelRecordPileCollector(); col = new IndelRecordPileCollector(new DiscardingReceiver(), new PileBuilder() );
} catch(Exception e) { System.err.println(e.getMessage()); } } catch(Exception e) { System.err.println(e.getMessage()); }
if ( col == null ) return 1; if ( col == null ) return 1;
@ -105,8 +106,8 @@ public class IndelInspector extends CommandLineProgram {
else if ( ERR_MODE.equals("ERR")) err = numErrors(r); else if ( ERR_MODE.equals("ERR")) err = numErrors(r);
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r); else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r);
if ( err > MAX_ERRS.intValue() ) continue; if ( err > MAX_ERRS.intValue() ) continue;
counter++; // counter++;
if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString()); // if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
col.receive(r); col.receive(r);
} }

View File

@ -86,16 +86,16 @@ public class IndelRecordPileCollector implements RecordReceiver {
private List<Integer> mIndelLengthHistI; private List<Integer> mIndelLengthHistI;
private List<Integer> mIndelLengthHistD; private List<Integer> mIndelLengthHistD;
private RecordReceiver nonindelReceiver; // we will send there records that do not overlap with regions of interest private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest
private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
public String memStatsString() { public String memStatsString() {
String s = "mRecordPile: "; String s = "mRecordPile: ";
return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef; return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef;
//+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop; //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
} }
public IndelRecordPileCollector() throws java.io.IOException { public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
mRecordPile = new LinkedList<SAMRecord>(); mRecordPile = new LinkedList<SAMRecord>();
mAllIndels = new TreeSet<CountedObject<Indel> >( mAllIndels = new TreeSet<CountedObject<Indel> >(
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator())); new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
@ -108,8 +108,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
mIndelLengthHistI.add(0); mIndelLengthHistI.add(0);
mIndelLengthHistD.add(0); mIndelLengthHistD.add(0);
} }
nonindelReceiver = new DiscardingReceiver(); defaultReceiver = rr;
indelPileReceiver = new DiscardingPileReceiver(); indelPileReceiver = rp;
setWaitState(); setWaitState();
} }
@ -138,7 +138,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
while ( i.hasNext() ) { while ( i.hasNext() ) {
SAMRecord r = i.next(); SAMRecord r = i.next();
if ( r.getAlignmentEnd() <= pos ) { if ( r.getAlignmentEnd() <= pos ) {
nonindelReceiver.receive(r); defaultReceiver.receive(r);
i.remove(); i.remove();
} else break; } else break;
} }
@ -153,7 +153,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
while ( i.hasNext() ) { while ( i.hasNext() ) {
SAMRecord r = i.next(); SAMRecord r = i.next();
if ( r.getAlignmentStart() >= pos ) { if ( r.getAlignmentStart() >= pos ) {
nonindelReceiver.receive(r); defaultReceiver.receive(r);
i.remove(); i.remove();
} else break; } else break;
} }
@ -243,8 +243,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
" bases between indels"); " bases between indels");
} }
// no indels or avoiding indels in bad region: send all records to nonindelReceiver and clear the pile // no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile
for ( SAMRecord r : mRecordPile ) nonindelReceiver.receive(r); for ( SAMRecord r : mRecordPile ) defaultReceiver.receive(r);
setWaitState(); setWaitState();
return; return;
} }
@ -277,7 +277,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
while ( indel != null ) { while ( indel != null ) {
// first, if we just started new indel train, then emit into nonindelReceiver all alignments // first, if we just started new indel train, then emit into defaultReceiver all alignments
// that end prior to the first indel in the train: // that end prior to the first indel in the train:
if ( finalTrain.size() == 0 ) purgeRecordsEndingAtOrBefore(indel.getObject().getStart() - 1); if ( finalTrain.size() == 0 ) purgeRecordsEndingAtOrBefore(indel.getObject().getStart() - 1);
@ -315,8 +315,14 @@ public class IndelRecordPileCollector implements RecordReceiver {
finalTrain.size() + " indels"); finalTrain.size() + " indels");
System.out.println(formatRange(finalTrain)); System.out.println(formatRange(finalTrain));
if ( shouldAcceptForOutput(finalTrain ) ) indelPileReceiver.receive(finalPile); if ( shouldAcceptForOutput(finalTrain ) ) {
else for ( SAMRecord r : finalPile ) nonindelReceiver.receive(r); System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" +
finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " +
finalTrain.size() + " indels; ");
System.out.print(finalPile.size() + " reads in the pile;") ;
System.out.println(formatRange(finalTrain));
indelPileReceiver.receive(finalPile);
} else for ( SAMRecord r : finalPile ) defaultReceiver.receive(r);
finalPile.clear(); finalPile.clear();
finalTrain.clear(); finalTrain.clear();
curr_stop = -1; curr_stop = -1;
@ -444,7 +450,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
long min = 1000000000; long min = 1000000000;
long max = 0; long max = 0;
all.append("passing indels:");
for ( CountedObject<Indel> o : indels ) { for ( CountedObject<Indel> o : indels ) {
if ( o.getCount() < 2 ) continue; if ( o.getCount() < 2 ) continue;
all.append(" "); all.append(" ");
@ -452,9 +459,9 @@ public class IndelRecordPileCollector implements RecordReceiver {
if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength(); if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength();
if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength(); if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength();
} }
b.append(" min: "); b.append("; passing min length: ");
b.append(min); b.append(min);
b.append(" max: "); b.append("; passing max length: ");
b.append(max); b.append(max);
b.append(all); b.append(all);
return b.toString(); return b.toString();

View File

@ -1,11 +1,18 @@
package org.broadinstitute.sting.indels; package org.broadinstitute.sting.indels;
import net.sf.samtools.SAMRecord;
public class PileBuilder { import java.util.Collection;
public class PileBuilder implements RecordPileReceiver {
private StrictlyUpperTriangularMatrix distances ; private StrictlyUpperTriangularMatrix distances ;
private Matrix<PairwiseAlignment> alignments ; private Matrix<PairwiseAlignment> alignments ;
private static final int KmerSize = 8;
private static class SelectedPair { private MultipleAlignment alignments1;
private MultipleAlignment alignments2;
private static class SelectedPair {
private int i_; private int i_;
private int j_; private int j_;
private double d_; private double d_;
@ -40,7 +47,23 @@ public class PileBuilder {
} }
public PileBuilder() {} public PileBuilder() {}
public void receive(Collection<SAMRecord> c) {
IndexedSequence[] seqs = new IndexedSequence[c.size()];
int i = 0;
for ( SAMRecord r : c ) {
seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize);
}
doMultipleAlignment(seqs);
System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
System.out.println("; diameter of PILE1: "+ diameter(alignments1));
System.out.println("; diameter of PILE2: "+ diameter(alignments2));
System.out.println("PILE 1: \n"+alignments1.toString());
System.out.println("PILE 2: \n"+alignments2.toString());
}
public void initPairwiseAlignments( IndexedSequence [] seqs ) { public void initPairwiseAlignments( IndexedSequence [] seqs ) {
distances = new StrictlyUpperTriangularMatrix( seqs.length ); distances = new StrictlyUpperTriangularMatrix( seqs.length );
alignments = new Matrix<PairwiseAlignment>( seqs.length ); alignments = new Matrix<PairwiseAlignment>( seqs.length );
@ -57,7 +80,7 @@ public class PileBuilder {
* so that the alignments are pre-computed. * so that the alignments are pre-computed.
* @return id's of the two sequences and the distance between them in a compound object. * @return id's of the two sequences and the distance between them in a compound object.
*/ */
public SelectedPair findBest() { public SelectedPair findClosestPair() {
SelectedPair p = new SelectedPair(-1,-1,1e100); SelectedPair p = new SelectedPair(-1,-1,1e100);
@ -95,7 +118,7 @@ public class PileBuilder {
* @param pexclude neither of the two sequences in the returned pair can have its id listed in pexclude pair. * @param pexclude neither of the two sequences in the returned pair can have its id listed in pexclude pair.
* @return Best pairwise alignment excluding alignments between pairs involving at least one sequence from pexclude * @return Best pairwise alignment excluding alignments between pairs involving at least one sequence from pexclude
*/ */
public SelectedPair findBest(SelectedPair pexclude) { public SelectedPair findNextClosestPairAfter(SelectedPair pexclude) {
SelectedPair p = new SelectedPair(-1,-1,1e100); SelectedPair p = new SelectedPair(-1,-1,1e100);
@ -117,7 +140,7 @@ public class PileBuilder {
* @return a compound SelectedPair object that contains the index of the closest sequence found (is guaranteed to * @return a compound SelectedPair object that contains the index of the closest sequence found (is guaranteed to
* be not in the pile), the index of the sequence in the pile it is closest to, and the actual distance between the two. * be not in the pile), the index of the sequence in the pile it is closest to, and the actual distance between the two.
*/ */
public SelectedPair findClosest(MultipleAlignment a) { public SelectedPair findClosestToPile(MultipleAlignment a) {
SelectedPair p = new SelectedPair(-1,-1,1e100); SelectedPair p = new SelectedPair(-1,-1,1e100);
@ -143,7 +166,7 @@ public class PileBuilder {
* @param a alignment pile to find the closest sequence for * @param a alignment pile to find the closest sequence for
* @return index of the farthest sequence * @return index of the farthest sequence
*/ */
public int findFarthest(MultipleAlignment a) { public int findFarthestFromPile(MultipleAlignment a) {
double dmaxmin = 0; double dmaxmin = 0;
int i_out = -1; int i_out = -1;
@ -209,17 +232,22 @@ public class PileBuilder {
IndexedSequence [] seqs = testSet3(K); // initialize test set data IndexedSequence [] seqs = testSet3(K); // initialize test set data
PileBuilder pb = new PileBuilder(); PileBuilder pb = new PileBuilder();
pb.doMultipleAlignment(seqs);
}
public void doMultipleAlignment(IndexedSequence[] seqs) {
// two piles we are going to grow until all sequences are assigned to one of them. // two piles we are going to grow until all sequences are assigned to one of them.
// we intend to keep the piles disjoint, e.g. no sequence should be placed in both // we intend to keep the piles disjoint, e.g. no sequence should be placed in both
MultipleAlignment pile1 = new MultipleAlignment(); MultipleAlignment pile1 = new MultipleAlignment();
MultipleAlignment pile2 = new MultipleAlignment(); MultipleAlignment pile2 = new MultipleAlignment();
pb.initPairwiseAlignments(seqs); initPairwiseAlignments(seqs);
// all the pairwise alignments are computed and disjoint best and next-best pairs are found // all the pairwise alignments are computed and disjoint best and next-best pairs are found
System.out.println( pb.distances.format("%8.4g ")); //System.out.println( distances.format("%8.4g "));
/* /*
SelectedPair pworst = pb.findWorst(); SelectedPair pworst = pb.findWorst();
@ -229,10 +257,10 @@ public class PileBuilder {
*/ */
// initialize piles with best and next-best pairs // initialize piles with best and next-best pairs
SelectedPair p_best = pb.findBest(); SelectedPair p_best = findClosestPair();
SelectedPair p_nextbest = pb.findBest(p_best); SelectedPair p_nextbest = findNextClosestPairAfter(p_best);
pile1.add( pb.alignments.get(p_best.i(), p_best.j()),p_best.i(),p_best.j()); pile1.add( alignments.get(p_best.i(), p_best.j()),p_best.i(),p_best.j());
pile2.add( pb.alignments.get(p_nextbest.i(), p_nextbest.j()),p_nextbest.i(),p_nextbest.j()); pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j()),p_nextbest.i(),p_nextbest.j());
/* /*
System.out.println("Best pair ("+p_best.i() + "," + p_best.j()+", d="+p_best.d()+"):"); System.out.println("Best pair ("+p_best.i() + "," + p_best.j()+", d="+p_best.d()+"):");
System.out.println(pile1.toString()); System.out.println(pile1.toString());
@ -245,42 +273,39 @@ public class PileBuilder {
// grow piles hierarchical clustering-style // grow piles hierarchical clustering-style
while ( pile1.size() + pile2.size() < seqs.length ) { while ( pile1.size() + pile2.size() < seqs.length ) {
// find candidate sequences closest to each of the two piles // find candidate sequences closest to each of the two piles
p1 = pb.findClosest(pile1); p1 = findClosestToPile(pile1);
p2 = pb.findClosest(pile2); p2 = findClosestToPile(pile2);
int id1_cand = pile1.selectExternal(p1.i(), p1.j()); // id of the sequence closest to the pile 1 int id1_cand = pile1.selectExternal(p1.i(), p1.j()); // id of the sequence closest to the pile 1
int id2_cand = pile2.selectExternal(p2.i(), p2.j()); // id of the sequence closest to the pile 2 int id2_cand = pile2.selectExternal(p2.i(), p2.j()); // id of the sequence closest to the pile 2
if ( pile2.contains(id1_cand) && pile1.contains(id2_cand)) { if ( pile2.contains(id1_cand) && pile1.contains(id2_cand)) {
// pile1 and pile 2 are mutually the closest, so we need to merge them. // pile1 and pile 2 are mutually the closest, so we need to merge them.
// if piles are mutually the closest, then p1 and p2 are the same pair (id1, id2), // if piles are mutually the closest, then p1 and p2 are the same pair (id1, id2),
// so we just merge on one of the (redundant) instances: // so we just merge on one of the (redundant) instances:
pile1.add(pile2, pb.alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j());
pile2.clear(); // need to reset pile 2 to something else pile2.clear(); // need to reset pile 2 to something else
int z = pb.findFarthest(pile1); // get sequence farthest from merged pile 1 int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1
pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else { } else {
if ( p1.d() < p2.d() ) { if ( p1.d() < p2.d() ) {
if ( pile2.contains(id1_cand) ) { if ( pile2.contains(id1_cand) ) {
pile1.add(pile2, pb.alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j());
pile2.clear(); // need to reset pile 2 to something else pile2.clear(); // need to reset pile 2 to something else
int z = pb.findFarthest(pile1); // get sequence farthest from merged pile 1 int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1
pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else pile1.add( pb.alignments.get(p1.i(), p1.j()) ); } else pile1.add( alignments.get(p1.i(), p1.j()) );
} else { } else {
if ( pile1.contains(id2_cand) ) { if ( pile1.contains(id2_cand) ) {
pile2.add(pile1, pb.alignments.get( p2.i(), p2.j()),p2.i(), p2.j()); pile2.add(pile1, alignments.get( p2.i(), p2.j()),p2.i(), p2.j());
pile1.clear(); // need to reset pile 2 to something else pile1.clear(); // need to reset pile 2 to something else
int z = pb.findFarthest(pile2); // get sequence farthest from merged pile 1 int z = findFarthestFromPile(pile2); // get sequence farthest from merged pile 1
pile1.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence pile1.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else pile2.add( pb.alignments.get(p2.i(), p2.j()) ); } else pile2.add( alignments.get(p2.i(), p2.j()) );
} }
} }
System.out.println("PILE 1: \n"+pile1.toString());
System.out.println("PILE 2: \n"+pile2.toString());
} // end WHILE } // end WHILE
System.out.println("Distance between final piles: "+pb.distance(pile1, pile2)); alignments1 = pile1;
System.out.println("Diameter of PILE1: "+ pb.diameter(pile1)); alignments2 = pile2;
System.out.println("Diameter of PILE2: "+ pb.diameter(pile2));
/* /*
* System.out.println("Closest distance to the pile: " + best_d * System.out.println("Closest distance to the pile: " + best_d
+ "(adding: " + best_i + "," + best_j + "):"); + "(adding: " + best_i + "," + best_j + "):");