diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java index 7b16b94bb..86aca048d 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java @@ -23,7 +23,7 @@ public class IndelInspector extends CommandLineProgram { // Usage and parameters @Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n"; - @Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE; + @Option(shortName="I", doc="SAM or BAM file for calling",optional=true) public File INPUT_FILE; @Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION; @Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE; @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS; @@ -36,7 +36,7 @@ public class IndelInspector extends CommandLineProgram { protected int doWork() { - System.out.println("I am at version 0.1"); + System.out.println("I am at version 0.2"); GenomeLoc location = null; if ( GENOME_LOCATION != null ) { location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION); @@ -47,7 +47,8 @@ public class IndelInspector extends CommandLineProgram { return 1; } - final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1kG/")); + final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/")); + samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); setContigOrdering(samReader); @@ -69,6 +70,7 @@ public class IndelInspector extends CommandLineProgram { for ( SAMRecord r : samReader ) { + if ( r.getReadUnmappedFlag() ) continue; if ( r.getReferenceName() != cur_contig) { cur_contig = r.getReferenceName(); System.out.println("Contig "+cur_contig); @@ -173,6 +175,7 @@ public class IndelInspector extends CommandLineProgram { } private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) { + if ( r.getReadUnmappedFlag() ) return 1000000; int i_ref = r.getAlignmentStart()-1; // position on the ref int i_read = 0; // position on the read int mm_count = 0; // number of mismatches diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java index 78e6e19c5..5a1997b3b 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java @@ -310,11 +310,6 @@ public class IndelRecordPileCollector implements RecordReceiver { // this means that the current train and pile of reads overlapping with it are fully built; // emit into indel receiver if the train is interesting enough, or into the nonindel receiver: - System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" + - finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " + - finalTrain.size() + " indels"); - System.out.println(formatRange(finalTrain)); - if ( shouldAcceptForOutput(finalTrain ) ) { System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" + finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " + @@ -451,7 +446,7 @@ public class IndelRecordPileCollector implements RecordReceiver { long min = 1000000000; long max = 0; - all.append("passing indels:"); + all.append("; passing indels:"); for ( CountedObject o : indels ) { if ( o.getCount() < 2 ) continue; all.append(" "); @@ -459,7 +454,9 @@ public class IndelRecordPileCollector implements RecordReceiver { if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength(); if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength(); } - b.append("; passing min length: "); + if ( max == 0 ) return new String(); // no passinf indels, return empty string + + b.append(" passing min length: "); b.append(min); b.append("; passing max length: "); b.append(max); diff --git a/playground/java/src/org/broadinstitute/sting/indels/MultipleAlignment.java b/playground/java/src/org/broadinstitute/sting/indels/MultipleAlignment.java index b8c29dc6f..b6e79a567 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/MultipleAlignment.java +++ b/playground/java/src/org/broadinstitute/sting/indels/MultipleAlignment.java @@ -1,18 +1,14 @@ package org.broadinstitute.sting.indels; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; +import java.util.*; public class MultipleAlignment implements Iterable { private static final int IMPOSSIBLE = 1000000000; - private Map index; - private List seqs; - private List ext_ids; - private List alignment_offsets; // offset of seqs[i] w/respect to seqs[0] + private Map index; // maps external id of the sequence onto its index in the pile + private List seqs; // sequences, in order they were added + private List ext_ids; // external ids of the sequences, in order they were added to the pile + private List alignment_offsets; // offset of seqs[i] w/respect to seqs[0] (i.e. in order the seqs were added) private int best_mm; // mismatch count private int next_mm; // next-best mismatch count @@ -29,7 +25,27 @@ public class MultipleAlignment implements Iterable { alignment_offsets.clear(); ext_ids.clear(); } - + + public static Integer[] SortPermutation( List A ) + { + final Object[] data = A.toArray(); + + class comparator implements Comparator + { + public int compare(Integer a, Integer b) + { + return ((T)data[a]).compareTo(data[b]); + } + } + Integer[] permutation = new Integer[A.size()]; + for (int i = 0; i < A.size(); i++) + { + permutation[i] = i; + } + Arrays.sort(permutation, new comparator()); + return permutation; + } + /** Adds single sequence with id set to i. Pile must be empty, or IllegalStateException will be thrown * * @param seq sequence to add @@ -182,11 +198,22 @@ public class MultipleAlignment implements Iterable { return i; } } - + + public String skipN(int n) { + StringBuilder b=new StringBuilder(); + for ( int k = 0 ; k < n ; k++ ) b.append(' '); + return b.toString(); + } + + public void skipN(int n, StringBuilder b) { + for ( int k = 0 ; k < n ; k++ ) b.append(' '); + } + /** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately * shifted and ready for printout; */ - public String toString() { + public String toString(boolean inorder) { + StringBuilder b = new StringBuilder(); java.util.Formatter frmt = new java.util.Formatter(b); @@ -196,23 +223,23 @@ public class MultipleAlignment implements Iterable { for ( int i = 0 ; i < seqs.size() ; i++ ) { if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i); } + + Integer[] perm = null; + if ( inorder ) perm = SortPermutation(alignment_offsets); - frmt.format("%3d:", ext_ids.get(0)); - for ( int k = 0 ; k < skip_first ; k++ ) b.append(' '); - b.append(seqs.get(0)); - b.append('\n'); - - for ( int i = 1 ; i < seqs.size() ; i++ ) { - frmt.format("%3d:", ext_ids.get(i)); - int skip = alignment_offsets.get(i) + skip_first ; - for ( int k = 0 ; k < skip ; k++ ) b.append(' '); - b.append(seqs.get(i)); + for ( int i = 0 ; i < seqs.size() ; i++ ) { + int index = (inorder ? perm[i] : i); + frmt.format("%3d:", ext_ids.get(index)); + skipN(alignment_offsets.get(index)+skip_first,b); + b.append(seqs.get(index)); b.append('\n'); } // b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n"); return b.toString(); } - + + public String toString() { return toString(true); } + public int size() { return seqs.size(); } /** Returns an iterator over the id's of the sequences currently stored in the pile @@ -221,8 +248,27 @@ public class MultipleAlignment implements Iterable { */ public Iterator sequenceIdIterator() { return index.keySet().iterator(); } + public Iterator sequenceIdByOffsetIterator() { + final Integer[] perm = SortPermutation(alignment_offsets); + return new Iterator() { + private int i = 0; + public boolean hasNext() { + return i < perm.length; + } + public Integer next() { + return ext_ids.get(perm[i++]); + } + public void remove() { + throw new UnsupportedOperationException("remove not supported"); + } + } ; + + } + @Override public Iterator iterator() { return sequenceIdIterator(); } + + } diff --git a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java index 16686f5bd..772157aba 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java +++ b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.indels; import net.sf.samtools.SAMRecord; import java.util.Collection; +import java.util.Iterator; public class PileBuilder implements RecordPileReceiver { @@ -56,7 +57,7 @@ public class PileBuilder implements RecordPileReceiver { } doMultipleAlignment(seqs); System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); - System.out.println("; diameter of PILE1: "+ diameter(alignments1)); + System.out.print("; diameter of PILE1: "+ diameter(alignments1)); System.out.println("; diameter of PILE2: "+ diameter(alignments2)); System.out.println("PILE 1: \n"+alignments1.toString()); @@ -209,18 +210,23 @@ public class PileBuilder implements RecordPileReceiver { */ public double diameter(MultipleAlignment a) { double dmaxmin = 0.0; - for ( Integer id1 : a ) { - double d = 1e100; - for ( Integer id2 : a ) { - if ( id2 <= id1 ) continue; - if ( distances.get(id1,id2) < d ) d = distances.get(id1,id2); + System.out.print("\n["); + Iterator ids1 = a.sequenceIdByOffsetIterator(); + while ( ids1.hasNext() ) { + Integer id1 = ids1.next(); + double d = 1e100; // will hold distance from id1 to its closest neighbor + for ( Integer id2 : a ) { + if ( id2 == id1 ) continue; + double dpair = ( id1 < id2 ? distances.get(id1,id2) : distances.get(id2,id1) ); + d = Math.min(d,dpair); } // d = distance from id1 to its closest neighbor within the pile if ( d < 1e99 ) System.out.printf("%8.4g",d); if ( d < 1e99 && d > dmaxmin ) dmaxmin = d; } + System.out.println(" ]"); // dmaxmin = the largest distance from a sequence in this pile to its closest neighbor - System.out.println(); +// System.out.println(); return dmaxmin; } @@ -229,11 +235,18 @@ public class PileBuilder implements RecordPileReceiver { int K=8; // IndexedSequence [] seqs = testSet1(K); // initialize test set data // IndexedSequence [] seqs = testSet2(K); // initialize test set data - IndexedSequence [] seqs = testSet3(K); // initialize test set data - +// IndexedSequence [] seqs = testSet3(K); // initialize test set data + IndexedSequence [] seqs = testSet4(K); // initialize test set data + PileBuilder pb = new PileBuilder(); pb.doMultipleAlignment(seqs); + System.out.print("Distance between final piles: "+pb.distance(pb.alignments1, pb.alignments2)); + System.out.print("; diameter of PILE1: "+ pb.diameter(pb.alignments1)); + System.out.println("; diameter of PILE2: "+ pb.diameter(pb.alignments2)); + + System.out.println("PILE 1: \n"+pb.alignments1.toString()); + System.out.println("PILE 2: \n"+pb.alignments2.toString()); } public void doMultipleAlignment(IndexedSequence[] seqs) { @@ -359,4 +372,28 @@ public class PileBuilder implements RecordPileReceiver { seqs[10] = new IndexedSequence("GTCTGGTGAGGGTAGGGTGCACTCTCTGCTTCATAAATGGGTCTCTTGCCGCAAAAAAATCTGTTTGCTCCTCCAG",K); return seqs; } + + public static IndexedSequence[] testSet4(int K) { + IndexedSequence [] seqs = new IndexedSequence[19]; + seqs[0] = new IndexedSequence("CGTGTGTGTGTGTGCAGTGCGTGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTTTGTGAGATC",K); + seqs[1] = new IndexedSequence("ATGTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCAT",K); + seqs[2] = new IndexedSequence("GTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGC",K); + seqs[3] = new IndexedSequence("TGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCA",K); + seqs[4] = new IndexedSequence("GTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCAT",K); + seqs[5] = new IndexedSequence("GTGTGTGTGCCGTGCTTTGTGCTGTGAGATCTGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCTGCAT",K); + seqs[6] = new IndexedSequence("GTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGT",K); + seqs[7] = new IndexedSequence("GTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGT",K); + seqs[8] = new IndexedSequence("TGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTG",K); + seqs[9] = new IndexedSequence("AGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGT",K); + seqs[10] = new IndexedSequence("TGGGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGCAGCGCATGGTGCTGTGTGAGATCAGCGTGTGTGTGTGCAG",K); + seqs[11] = new IndexedSequence("GCTGTGAGATCAGCGTGTGTGTGTGAGCAGTGCATGGGGATGTGTGAGATCAGCATGTGTGTGTGTGTGCAGCGCG",K); + seqs[12] = new IndexedSequence("GCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCA",K); + seqs[13] = new IndexedSequence("AGATCAGCATGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGTGC",K); + seqs[14] = new IndexedSequence("AGATCAGCGTGTGTGTGTGCAGCGCATGGCGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCGGCGCATGGGGGTG",K); + seqs[15] = new IndexedSequence("GATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGAATGTGTGTGTGTGTGCAGTGCATGGTGCT",K); + seqs[16] = new IndexedSequence("ATCAGCATGGGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGGGTGTGTGGGGTGGGTGGTGGTG",K); + seqs[17] = new IndexedSequence("ATCAGCATGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGGGCTG",K); + seqs[18] = new IndexedSequence("GTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGTGCTGAGTGTG",K); + return seqs; + } }