minor changes and shuffling code around; also, now when realigned piles are printed they are sorted by start position

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@125 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-03-21 17:43:49 +00:00
parent 0ea44a5805
commit 497eea2e5c
4 changed files with 125 additions and 42 deletions

View File

@ -23,7 +23,7 @@ public class IndelInspector extends CommandLineProgram {
// Usage and parameters
@Usage(programVersion="1.0") public String USAGE = "Investigates indels called in the alignment data\n";
@Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE;
@Option(shortName="I", doc="SAM or BAM file for calling",optional=true) public File INPUT_FILE;
@Option(shortName="L",doc="Genomic interval to run on, as contig[:start[-stop]]; whole genome if not specified", optional=true) public String GENOME_LOCATION;
@Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
@ -36,7 +36,7 @@ public class IndelInspector extends CommandLineProgram {
protected int doWork() {
System.out.println("I am at version 0.1");
System.out.println("I am at version 0.2");
GenomeLoc location = null;
if ( GENOME_LOCATION != null ) {
location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);
@ -47,7 +47,8 @@ public class IndelInspector extends CommandLineProgram {
return 1;
}
final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1kG/"));
final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/"));
samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
setContigOrdering(samReader);
@ -69,6 +70,7 @@ public class IndelInspector extends CommandLineProgram {
for ( SAMRecord r : samReader ) {
if ( r.getReadUnmappedFlag() ) continue;
if ( r.getReferenceName() != cur_contig) {
cur_contig = r.getReferenceName();
System.out.println("Contig "+cur_contig);
@ -173,6 +175,7 @@ public class IndelInspector extends CommandLineProgram {
}
private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) {
if ( r.getReadUnmappedFlag() ) return 1000000;
int i_ref = r.getAlignmentStart()-1; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches

View File

@ -310,11 +310,6 @@ public class IndelRecordPileCollector implements RecordReceiver {
// this means that the current train and pile of reads overlapping with it are fully built;
// emit into indel receiver if the train is interesting enough, or into the nonindel receiver:
System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" +
finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " +
finalTrain.size() + " indels");
System.out.println(formatRange(finalTrain));
if ( shouldAcceptForOutput(finalTrain ) ) {
System.out.print(mLastContig+":"+ finalTrain.get(0).getObject().getStart() + "-" +
finalTrain.get(finalTrain.size()-1).getObject().getStop() + " " +
@ -451,7 +446,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
long min = 1000000000;
long max = 0;
all.append("passing indels:");
all.append("; passing indels:");
for ( CountedObject<Indel> o : indels ) {
if ( o.getCount() < 2 ) continue;
all.append(" ");
@ -459,7 +454,9 @@ public class IndelRecordPileCollector implements RecordReceiver {
if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength();
if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength();
}
b.append("; passing min length: ");
if ( max == 0 ) return new String(); // no passinf indels, return empty string
b.append(" passing min length: ");
b.append(min);
b.append("; passing max length: ");
b.append(max);

View File

@ -1,18 +1,14 @@
package org.broadinstitute.sting.indels;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.*;
public class MultipleAlignment implements Iterable<Integer> {
private static final int IMPOSSIBLE = 1000000000;
private Map<Integer,Integer> index;
private List<String> seqs;
private List<Integer> ext_ids;
private List<Integer> alignment_offsets; // offset of seqs[i] w/respect to seqs[0]
private Map<Integer,Integer> index; // maps external id of the sequence onto its index in the pile
private List<String> seqs; // sequences, in order they were added
private List<Integer> ext_ids; // external ids of the sequences, in order they were added to the pile
private List<Integer> alignment_offsets; // offset of seqs[i] w/respect to seqs[0] (i.e. in order the seqs were added)
private int best_mm; // mismatch count
private int next_mm; // next-best mismatch count
@ -29,7 +25,27 @@ public class MultipleAlignment implements Iterable<Integer> {
alignment_offsets.clear();
ext_ids.clear();
}
public static <T extends Comparable> Integer[] SortPermutation( List<T> A )
{
final Object[] data = A.toArray();
class comparator implements Comparator<Integer>
{
public int compare(Integer a, Integer b)
{
return ((T)data[a]).compareTo(data[b]);
}
}
Integer[] permutation = new Integer[A.size()];
for (int i = 0; i < A.size(); i++)
{
permutation[i] = i;
}
Arrays.sort(permutation, new comparator());
return permutation;
}
/** Adds single sequence with id set to i. Pile must be empty, or IllegalStateException will be thrown
*
* @param seq sequence to add
@ -182,11 +198,22 @@ public class MultipleAlignment implements Iterable<Integer> {
return i;
}
}
public String skipN(int n) {
StringBuilder b=new StringBuilder();
for ( int k = 0 ; k < n ; k++ ) b.append(' ');
return b.toString();
}
public void skipN(int n, StringBuilder b) {
for ( int k = 0 ; k < n ; k++ ) b.append(' ');
}
/** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately
* shifted and ready for printout;
*/
public String toString() {
public String toString(boolean inorder) {
StringBuilder b = new StringBuilder();
java.util.Formatter frmt = new java.util.Formatter(b);
@ -196,23 +223,23 @@ public class MultipleAlignment implements Iterable<Integer> {
for ( int i = 0 ; i < seqs.size() ; i++ ) {
if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i);
}
Integer[] perm = null;
if ( inorder ) perm = SortPermutation(alignment_offsets);
frmt.format("%3d:", ext_ids.get(0));
for ( int k = 0 ; k < skip_first ; k++ ) b.append(' ');
b.append(seqs.get(0));
b.append('\n');
for ( int i = 1 ; i < seqs.size() ; i++ ) {
frmt.format("%3d:", ext_ids.get(i));
int skip = alignment_offsets.get(i) + skip_first ;
for ( int k = 0 ; k < skip ; k++ ) b.append(' ');
b.append(seqs.get(i));
for ( int i = 0 ; i < seqs.size() ; i++ ) {
int index = (inorder ? perm[i] : i);
frmt.format("%3d:", ext_ids.get(index));
skipN(alignment_offsets.get(index)+skip_first,b);
b.append(seqs.get(index));
b.append('\n');
}
// b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n");
return b.toString();
}
public String toString() { return toString(true); }
public int size() { return seqs.size(); }
/** Returns an iterator over the id's of the sequences currently stored in the pile
@ -221,8 +248,27 @@ public class MultipleAlignment implements Iterable<Integer> {
*/
public Iterator<Integer> sequenceIdIterator() { return index.keySet().iterator(); }
public Iterator<Integer> sequenceIdByOffsetIterator() {
final Integer[] perm = SortPermutation(alignment_offsets);
return new Iterator<Integer>() {
private int i = 0;
public boolean hasNext() {
return i < perm.length;
}
public Integer next() {
return ext_ids.get(perm[i++]);
}
public void remove() {
throw new UnsupportedOperationException("remove not supported");
}
} ;
}
@Override
public Iterator<Integer> iterator() {
return sequenceIdIterator();
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.indels;
import net.sf.samtools.SAMRecord;
import java.util.Collection;
import java.util.Iterator;
public class PileBuilder implements RecordPileReceiver {
@ -56,7 +57,7 @@ public class PileBuilder implements RecordPileReceiver {
}
doMultipleAlignment(seqs);
System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
System.out.println("; diameter of PILE1: "+ diameter(alignments1));
System.out.print("; diameter of PILE1: "+ diameter(alignments1));
System.out.println("; diameter of PILE2: "+ diameter(alignments2));
System.out.println("PILE 1: \n"+alignments1.toString());
@ -209,18 +210,23 @@ public class PileBuilder implements RecordPileReceiver {
*/
public double diameter(MultipleAlignment a) {
double dmaxmin = 0.0;
for ( Integer id1 : a ) {
double d = 1e100;
for ( Integer id2 : a ) {
if ( id2 <= id1 ) continue;
if ( distances.get(id1,id2) < d ) d = distances.get(id1,id2);
System.out.print("\n[");
Iterator<Integer> ids1 = a.sequenceIdByOffsetIterator();
while ( ids1.hasNext() ) {
Integer id1 = ids1.next();
double d = 1e100; // will hold distance from id1 to its closest neighbor
for ( Integer id2 : a ) {
if ( id2 == id1 ) continue;
double dpair = ( id1 < id2 ? distances.get(id1,id2) : distances.get(id2,id1) );
d = Math.min(d,dpair);
}
// d = distance from id1 to its closest neighbor within the pile
if ( d < 1e99 ) System.out.printf("%8.4g",d);
if ( d < 1e99 && d > dmaxmin ) dmaxmin = d;
}
System.out.println(" ]");
// dmaxmin = the largest distance from a sequence in this pile to its closest neighbor
System.out.println();
// System.out.println();
return dmaxmin;
}
@ -229,11 +235,18 @@ public class PileBuilder implements RecordPileReceiver {
int K=8;
// IndexedSequence [] seqs = testSet1(K); // initialize test set data
// IndexedSequence [] seqs = testSet2(K); // initialize test set data
IndexedSequence [] seqs = testSet3(K); // initialize test set data
// IndexedSequence [] seqs = testSet3(K); // initialize test set data
IndexedSequence [] seqs = testSet4(K); // initialize test set data
PileBuilder pb = new PileBuilder();
pb.doMultipleAlignment(seqs);
System.out.print("Distance between final piles: "+pb.distance(pb.alignments1, pb.alignments2));
System.out.print("; diameter of PILE1: "+ pb.diameter(pb.alignments1));
System.out.println("; diameter of PILE2: "+ pb.diameter(pb.alignments2));
System.out.println("PILE 1: \n"+pb.alignments1.toString());
System.out.println("PILE 2: \n"+pb.alignments2.toString());
}
public void doMultipleAlignment(IndexedSequence[] seqs) {
@ -359,4 +372,28 @@ public class PileBuilder implements RecordPileReceiver {
seqs[10] = new IndexedSequence("GTCTGGTGAGGGTAGGGTGCACTCTCTGCTTCATAAATGGGTCTCTTGCCGCAAAAAAATCTGTTTGCTCCTCCAG",K);
return seqs;
}
public static IndexedSequence[] testSet4(int K) {
IndexedSequence [] seqs = new IndexedSequence[19];
seqs[0] = new IndexedSequence("CGTGTGTGTGTGTGCAGTGCGTGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTTTGTGAGATC",K);
seqs[1] = new IndexedSequence("ATGTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCAT",K);
seqs[2] = new IndexedSequence("GTGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGC",K);
seqs[3] = new IndexedSequence("TGTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCA",K);
seqs[4] = new IndexedSequence("GTGTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCAT",K);
seqs[5] = new IndexedSequence("GTGTGTGTGCCGTGCTTTGTGCTGTGAGATCTGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCTGCAT",K);
seqs[6] = new IndexedSequence("GTGTGTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGT",K);
seqs[7] = new IndexedSequence("GTGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGT",K);
seqs[8] = new IndexedSequence("TGCAGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTG",K);
seqs[9] = new IndexedSequence("AGTGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGT",K);
seqs[10] = new IndexedSequence("TGGGCATGGTGCTGTGAGATCAGCGTGTGTGTGTGCAGCGCATGGTGCTGTGTGAGATCAGCGTGTGTGTGTGCAG",K);
seqs[11] = new IndexedSequence("GCTGTGAGATCAGCGTGTGTGTGTGAGCAGTGCATGGGGATGTGTGAGATCAGCATGTGTGTGTGTGTGCAGCGCG",K);
seqs[12] = new IndexedSequence("GCTGTGAGATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCA",K);
seqs[13] = new IndexedSequence("AGATCAGCATGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGTGC",K);
seqs[14] = new IndexedSequence("AGATCAGCGTGTGTGTGTGCAGCGCATGGCGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCGGCGCATGGGGGTG",K);
seqs[15] = new IndexedSequence("GATCAGCGTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGAATGTGTGTGTGTGTGCAGTGCATGGTGCT",K);
seqs[16] = new IndexedSequence("ATCAGCATGGGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGGGTGTGTGGGGTGGGTGGTGGTG",K);
seqs[17] = new IndexedSequence("ATCAGCATGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGGGCTG",K);
seqs[18] = new IndexedSequence("GTGTGTGTGTGTGCAGTGCATGGTGCTGTGTGAGATCAGCATGTGTGTGTGTGTGCAGTGCATGGTGCTGAGTGTG",K);
return seqs;
}
}