a few tweaks to make it more robust: ignore reads with cigars containing anything but I,D,M; don't set up contig ordering manually, rely upon reference sequence and its dictionary; don't die if a record does not have NM tag, but faal back to direct counting instead; now requires reference as a cmdline arg

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@378 348d0f76-0448-11de-a6fe-93d51630548a
2009-04-13 04:49:19 +00:00 · 2009-04-13 04:49:19 +00:00 · b4136b6d6e
parent 32e000bbfe
commit b4136b6d6e
5 changed files with 268 additions and 212 deletions
--- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
@ -127,7 +127,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
            System.out.printf("  Exception caught during parsing Pileup line:  %s%n", Utils.join(" <=> ", parts));
            throw e;
        }
-        if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant at "+ loc.toString());
+        if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant :  ("+refBaseChar+") -->"+toMediumString());
    }

    
@ -243,10 +243,13 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
    }

    public String toMediumString() {
-        String s = String.format("%s:%s:%s", getLocation().toString(), name, observedString);
+    	
+        String s = null;
+        if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
+        else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
        if ( isSNP() ) s += ": SNP";
        else {
-        	if ( isInsertion() ) s += ": Insertoin";
+        	if ( isInsertion() ) s += ": Insertion";
        	else {
        		if ( isDeletion() ) s+= ": Deletion";
        		else {
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MendelianInheritanceWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MendelianInheritanceWalker.java
@ -31,7 +31,13 @@ public class MendelianInheritanceWalker  extends RefWalker<TrioConcordanceRecord
 		AllelicVariant dad = (rodSAMPileup)rodData.lookup("father", null);
 		AllelicVariant kid = (rodSAMPileup)rodData.lookup("daughter", null);
 		
-		if ( ! hasCall(mom)  || ! hasCall(dad) || ! hasCall(kid) ) return t; // at least one person is not called; nothing to do, bail out
+		if ( hasCall(mom)) t.mom_assessed = 1;
+		if ( hasCall(dad)) t.dad_assessed = 1;
+		if ( hasCall(kid)) t.kid_assessed = 1;
+		
+		if ( hasCall(mom) && mom.isIndel() ) System.out.println("GOT INDEL: "+mom.toString());
+		
+		if (( t.mom_assessed + t.dad_assessed + t.kid_assessed) != 3 ) return t; // at least one person is not called; nothing to do, bail out
 			
 		// proceed only if we got confident calls
 			
@ -62,11 +68,11 @@ public class MendelianInheritanceWalker  extends RefWalker<TrioConcordanceRecord
 				mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) {
 			t.consistent_snp = 1;
 			
-			logger.info("consistent SNP at "+context.getLocation() + 
-					"("+ref+") " + mom_alleles.get(0)+"/" +mom_alleles.get(1) + "  " +
-					dad_alleles.get(0)+"/" +dad_alleles.get(1) + "  " +
-					kid_allele_1+"/" +kid_allele_2  
-					);
+//			logger.info("consistent SNP at "+context.getLocation() + 
+//					"("+ref+") " + mom_alleles.get(0)+"/" +mom_alleles.get(1) + "  " +
+//					dad_alleles.get(0)+"/" +dad_alleles.get(1) + "  " +
+//					kid_allele_1+"/" +kid_allele_2  
+//					);
 			return t;
 	    }
 		
--- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
@ -27,50 +27,56 @@ public class IndelInspector extends CommandLineProgram {
    @Option(doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1; 
    @Option(doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2;
    @Option(doc="[paranoid] If true, all reads that would be otherwise picked and processed by this tool will be saved, unmodified, into OUT1", optional=true) public boolean CONTROL_RUN;
-    @Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
+    @Option(doc="Error counting mode: MM - mismatches only (from sam tags), MC - mismatches only doing actual mismatch count on the fly (use this if tags are incorrectly set); ERR - errors (arachne style: mm+gap lengths), MG - count mismatches and gaps as one error each") public String ERR_MODE;
    @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
-//    @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
+    @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;

    /** Required main method implementation. */
    public static void main(final String[] argv) {
        System.exit(new IndelInspector().instanceMain(argv));
    }
    
-	protected int doWork() {
+    protected int doWork() {

-        System.out.println("I am at version 0.3");
+        int discarded_count = 0;
+
+        ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
+                    REF_FILE
+            );
+
+        if ( reference.getSequenceDictionary() == null ) {
+            System.out.println("No reference sequence dictionary found. Abort.");
+        }
+
+        GenomeLoc.setupRefContigOrdering(reference.getSequenceDictionary());
        GenomeLoc location = null;
        if ( GENOME_LOCATION != null ) {
            location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);
        }
+        
+        if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") && ! ERR_MODE.equals("MC") ) {
+            System.out.println("Unknown value specified for ERR_MODE: "+ERR_MODE);
+            return 1;
+        }

-		if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") ) {
-			System.out.println("Unknown value specified for ERR_MODE");
-			return 1;
-		}
-
-		final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/"));
+        final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/"));
        samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);

-        setContigOrdering(samReader);
-
-        ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
-                    new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
-            );
+        //        setContigOrdering(samReader);


        ReferenceSequence contig_seq = null;

-		IndelRecordPileCollector col = null;
+        IndelRecordPileCollector col = null;
        PassThroughWriter ptWriter = new PassThroughWriter(OUT1,samReader.getFileHeader());
        PileBuilder pileBuilder = null;
        if ( ! CONTROL_RUN ) pileBuilder = new PileBuilder(OUT2,samReader.getFileHeader(),ptWriter);

-		try {
+        try {
            if ( CONTROL_RUN ) col = new IndelRecordPileCollector(ptWriter, new DiscardingPileReceiver() );
-			else col = new IndelRecordPileCollector(ptWriter, pileBuilder );
-		} catch(Exception e) { System.err.println(e.getMessage()); }
-		if ( col == null ) return 1; 
+            else col = new IndelRecordPileCollector(ptWriter, pileBuilder );
+        } catch(Exception e) { System.err.println(e.getMessage()); }
+        if ( col == null ) return 1; 

        col.setControlRun(CONTROL_RUN);

@ -91,9 +97,9 @@ public class IndelInspector extends CommandLineProgram {
        for ( SAMRecord r : samReader ) {

            if ( r.getReadUnmappedFlag() ) continue; 
-        	if ( r.getReferenceName() != cur_contig) {
-        		cur_contig = r.getReferenceName();
-        		System.out.println("Contig "+cur_contig);
+            if ( r.getReferenceName() != cur_contig) {
+                cur_contig = r.getReferenceName();
+                System.out.println("Contig "+cur_contig);
                // if contig is specified and we are past that contig, we are done:
                if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
                if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
@ -105,18 +111,33 @@ public class IndelInspector extends CommandLineProgram {
                }
            }

-            // if contig is specified and wqe did not reach it yet, skip the records until we reach that contig:
-        	if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
+            // if contig is specified and we did not reach it yet, skip the records until we reach that contig:
+            if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;


            if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue;

            // if stop position is specified and we are past that, stop reading:
-        	if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
+            if ( location != null && r.getAlignmentStart() > location.getStop() ) break;

-        	if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now
-        	
-        	int err = -1;
+            //    if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY") > 0 ) continue; // skip chrM and unplaced contigs for now
+
+            // we currently do not know how to deal with cigars containing elements other than M,I,D, so 
+            // let's just skip the reads that contain those other elements (clipped reads?)
+            Cigar c = r.getCigar();
+            boolean cigar_acceptable = true;
+            for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
+                CigarElement ce = c.getCigarElement(z);
+                switch ( ce.getOperator() ) {
+                case M:
+                case I:
+                case D: break;
+                default: cigar_acceptable = false;
+                }
+            }
+            if ( ! cigar_acceptable ) continue;
+
+            int err = -1;
 /*
            System.out.println("MM:     "+numMismatches(r));
            System.out.println("direct: "+numMismatchesDirect(r,contig_seq));
@ -130,13 +151,14 @@ public class IndelInspector extends CommandLineProgram {
            continue;
 */

-        	if ( ERR_MODE.equals("MM")) err = numMismatches(r);
-        	else if ( ERR_MODE.equals("ERR")) err = numErrors(r);
-        	else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r);
-        	if ( err > MAX_ERRS.intValue() ) continue;
-//        	counter++;
-//        	if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
-        	col.receive(r);
+            if ( ERR_MODE.equals("MM")) err = numMismatches(r,contig_seq);
+            else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq);
+            else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq);
+            else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq);
+            if ( err > MAX_ERRS.intValue() ) continue;
+            //        	counter++;
+            //        	if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
+            col.receive(r);

        }
        
@ -149,7 +171,7 @@ public class IndelInspector extends CommandLineProgram {
        samReader.close();
        ptWriter.close();
        return 0;
-	}
+    }
 	
 	/** This method is a HACK: it is designed to work around the current bug in NM tags created  at CRD 
 	 * 
@ -157,10 +179,10 @@ public class IndelInspector extends CommandLineProgram {
 	 * @return number of errors (number of mismatches plus total length of all insertions/deletions
 	 * @throws RuntimeException
 	 */
-	private static int numErrors(SAMRecord r) throws RuntimeException {
+    private static int numErrors(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
 		
 		// NM currently stores the total number of mismatches in all blocks + 1
-		int errs = numMismatches(r);
+        int errs = numMismatches(r,refseq);
 		
 		// now we have to add the total length of all indels:
 		Cigar c = r.getCigar();
@ -185,10 +207,10 @@ public class IndelInspector extends CommandLineProgram {
 	 * deletion will be counted as a single error regardless of the length)
 	 * @throws RuntimeException
 	 */
-	private static int numMismatchesGaps(SAMRecord r) throws RuntimeException {
+    private static int numMismatchesGaps(SAMRecord r,ReferenceSequence refseq) throws RuntimeException {
 		
 		// NM currently stores the total number of mismatches in all blocks + 1
-		int errs = numMismatches(r);
+        int errs = numMismatches(r,refseq);
 		
 		// now we have to add the total length of all indels:
 		Cigar c = r.getCigar();
@ -207,12 +229,14 @@ public class IndelInspector extends CommandLineProgram {
 	}

    /** This method is a HACK: it is designed to work around the current bug in NM tags created  at CRD */
-	private static int numMismatches(SAMRecord r) throws RuntimeException {
+    private static int numMismatches(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
 		
-		// NM currently stores the total number of mismatches in all blocks + 1
-		return ((Integer)r.getAttribute("NM")).intValue() - 1;
+        // NM currently stores the total number of mismatches in all blocks + 1
+        Integer i = (Integer)r.getAttribute("NM");
+        if ( i == null ) return AlignmentUtils.numMismatches(r,refseq);
+        return ((Integer)r.getAttribute("NM")).intValue() - 1;
 		
-	}
+    }

    /** Trivial utility method that goes some distance trying to ensure that the input file is there;
     * the only purpose is reducing clutter in main(). Receives a default
@ -265,7 +289,7 @@ public class IndelInspector extends CommandLineProgram {
    private void setDefaultContigOrdering() {
        Map<String,Integer> rco = new HashMap<String,Integer>();
        rco.put("chrM",0);
-        for ( int i = 1 ; i <= 22 ; i++ ) rco.put("chr"+i,i);
+        for ( int i = 1 ; i <= 22 ; i++ ) rco.put(Integer.toString(i),i);//rco.put("chr"+i,i);
        rco.put("chrX",23);
        rco.put("chrY",24);
    }
--- a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java
@ -65,26 +65,26 @@ import net.sf.samtools.*;
 */
 public class IndelRecordPileCollector implements RecordReceiver {

-	private final int WAIT_STATE = 0;
-	private final int ACTIVE_STATE = 1;
-
-	private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
-                                     // we will determine their span and report them,
+    private final int WAIT_STATE = 0;
+    private final int ACTIVE_STATE = 1;
+    
+    private boolean avoiding_region; // some regions are too funky (contain very long indel trains)-
+    // we will determine their span and report them,
                                     // but we won't be counting any indels there or building piles

-	private List<SAMRecord> mRecordPile; // here we keep the records before we decide how we want to emit them
-	private TreeSet<CountedObject<Indel> > mAllIndels; ///< individual indels encountered, with observation counts
-	private int mLastContig ;    ///< keeps the index of the contig last alignment was on
-	private int mLastStartOnRef; ///< keeps the start position of the last alignment
-	private int mState; ///< WAIT_STATE or ACTIVE_STATE
-	private int mIndelSeparation; ///< Indels that are farther away from one another than this value
-	                               ///< will be emitted separately; trains of indels with less then
-		                           ///< mIndelSeparation bases between each adjacent pair will be emitted 
-		                           ///< as one pile.
+    private List<SAMRecord> mRecordPile; // here we keep the records before we decide how we want to emit them
+    private TreeSet<CountedObject<Indel> > mAllIndels; ///< individual indels encountered, with observation counts
+    private int mLastContig ;    ///< keeps the index of the contig last alignment was on
+    private int mLastStartOnRef; ///< keeps the start position of the last alignment
+    private int mState; ///< WAIT_STATE or ACTIVE_STATE
+    private int mIndelSeparation; ///< Indels that are farther away from one another than this value
+    ///< will be emitted separately; trains of indels with less then
+    ///< mIndelSeparation bases between each adjacent pair will be emitted 
+    ///< as one pile.
 	
-	// we will build histograms (distributions) of encountered indel lengths on the fly
-	private List<Integer> mIndelLengthHistI;
-	private List<Integer> mIndelLengthHistD;
+    // we will build histograms (distributions) of encountered indel lengths on the fly
+    private List<Integer> mIndelLengthHistI;
+    private List<Integer> mIndelLengthHistD;

    private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest
    private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
@ -97,27 +97,27 @@ public class IndelRecordPileCollector implements RecordReceiver {
 		String s = "mRecordPile: ";
 		return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef;
                //+" Bndries="+mIndelRegionStart +":"+ mIndelRegionStop;
-	}
+    }
 	

-	public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
-		mRecordPile = new LinkedList<SAMRecord>();
-		mAllIndels = new TreeSet<CountedObject<Indel> >(
-                new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
-		mLastContig = -1;
-		mLastStartOnRef = -1;
-		mIndelSeparation = 51;
-		mIndelLengthHistI = new ArrayList<Integer>();
-		mIndelLengthHistD = new ArrayList<Integer>();
-		for ( int i = 0 ; i < 5 ; i++ ) {
-			mIndelLengthHistI.add(0);
-			mIndelLengthHistD.add(0);
-		}
+    public IndelRecordPileCollector(RecordReceiver rr, RecordPileReceiver rp) throws java.io.IOException {
+        mRecordPile = new LinkedList<SAMRecord>();
+        mAllIndels = new TreeSet<CountedObject<Indel> >(
+              new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
+        mLastContig = -1;
+        mLastStartOnRef = -1;
+        mIndelSeparation = 51;
+        mIndelLengthHistI = new ArrayList<Integer>();
+        mIndelLengthHistD = new ArrayList<Integer>();
+        for ( int i = 0 ; i < 5 ; i++ ) {
+            mIndelLengthHistI.add(0);
+            mIndelLengthHistD.add(0);
+        }
        defaultReceiver = rr;
        indelPileReceiver = rp;
        referenceSequence = null;
        setWaitState();
-	}
+    }

    /** Fully reinitializes wait state: clears record pile and indel list, resets flags and states.
     * Does not emit records, just clears/resets the variables.
@ -136,16 +136,16 @@ public class IndelRecordPileCollector implements RecordReceiver {
    public void setReferenceSequence(String contig) {
        referenceSequence = contig;
    }
-
-	/** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile
+    
+    /** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile
     * all the consequtive records with alignment end positions less than or equal to the specified
     * position <code>pos</code>, until the first record is encountered that does not meet this condition. Note that
     * there might be more alignments that end at or before <code>pos</code> later on in the pile, but
     * they will <i> nit</i> be emitted/removed by this method.
-	 * @param pos all leading records with alignments ending before or at this position will be purged from the pile,
+     * @param pos all leading records with alignments ending before or at this position will be purged from the pile,
     * up to the first record that does not end at or before pos.
-	 */
-	protected void purgeRecordsEndingAtOrBefore(final long pos) {
+     */
+    protected void purgeRecordsEndingAtOrBefore(final long pos) {
        Iterator<SAMRecord> i = mRecordPile.iterator();
        while ( i.hasNext() ) {
            SAMRecord r = i.next();
@ -154,13 +154,13 @@ public class IndelRecordPileCollector implements RecordReceiver {
                i.remove();
            } else break;
        }
-	}
+    }
 	
 	/** A utility method: purges from the currently held SAM record pile all the records with alignment
 	 * start positions greater than or equal to the specified position <code>pos</code>
 	 * @param pos all records with alignments starting at or after this position will be purged from the pile
 	 */
-	protected void purgeRecordsStartingAtOrAfter(final int pos) {
+    protected void purgeRecordsStartingAtOrAfter(final int pos) {
        Iterator<SAMRecord> i = mRecordPile.iterator();
        while ( i.hasNext() ) {
            SAMRecord r = i.next();
@ -169,7 +169,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
                i.remove();
            } else break;
        }
-	}
+    }

    /** This is the main interface method of the collector: it receives alignments, inspects them, detects indels,
     *  updates and purges the read pile it keeps and emits alignments as needed.
@ -198,46 +198,46 @@ public class IndelRecordPileCollector implements RecordReceiver {
     */
    public void receive(final SAMRecord r) throws RuntimeException {
 		
-		if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do
+        if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do

        if ( controlRun ) {
            defaultReceiver.receive(r);
            return;
        }

-		int currContig = r.getReferenceIndex();
-		int currPos = r.getAlignmentStart();
+        int currContig = r.getReferenceIndex();
+        int currPos = r.getAlignmentStart();
 		
-		if ( currContig < mLastContig ) throw new RuntimeException("SAM file is not ordered by contigs");
-		if ( currContig == mLastContig && currPos < mLastStartOnRef ) throw new RuntimeException("SAM file is not ordered by start positions");
-		
-		if ( currContig > mLastContig ) {
-			// we jumped onto a new contig; emit everything we might have been building and purge the piles:
-			emit();
-		} else { // still on the same contig:
+        if ( currContig < mLastContig ) throw new RuntimeException("SAM file is not ordered by contigs");
+        if ( currContig == mLastContig && currPos < mLastStartOnRef ) throw new RuntimeException("SAM file is not ordered by start positions");
+	
+        if ( currContig > mLastContig ) {
+            // we jumped onto a new contig; emit everything we might have been building and purge the piles:
+            emit();
+        } else { // still on the same contig:

            switch (mState) {
-			            // everything ending up to currPos is guaranteed to have no overlaps with indels yet to come
-			    case WAIT_STATE: purgeRecordsEndingAtOrBefore(currPos); break;
-
-                       // next indel can start only after currPos (whether it is in the current read or in the
-                       // reads yet to come). If it is far enough from the last indel we have seen, we can emit
-                case ACTIVE_STATE: if ( currPos - mAllIndels.last().getObject().getStop() > mIndelSeparation ) emit(); break;
-                default: throw new RuntimeException("Unknown state");
+                // everything ending up to currPos is guaranteed to have no overlaps with indels yet to come
+            case WAIT_STATE: purgeRecordsEndingAtOrBefore(currPos); break;
+                
+                // next indel can start only after currPos (whether it is in the current read or in the
+                // reads yet to come). If it is far enough from the last indel we have seen, we can emit
+            case ACTIVE_STATE: if ( currPos - mAllIndels.last().getObject().getStop() > mIndelSeparation ) emit(); break;
+            default: throw new RuntimeException("Unknown state");
            }
        }

        // does nothing if alignment has no indels, otherwise adds the indels to the list and (re)sets state to 'active'
        extractIndelsAndUpdateState(r.getCigar(),currPos);

-		if ( ! avoiding_region && mAllIndels.size() > 20 ) avoiding_region = true;
+        if ( mState == ACTIVE_STATE && ( ! avoiding_region ) && ( mAllIndels.size() > 20 || mRecordPile.size() > 1000 ) ) avoiding_region = true;

-		if ( ! avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region
+        if ( ! avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region

-		mLastContig = currContig;
-		mLastStartOnRef = currPos;
+        mLastContig = currContig;
+        mLastStartOnRef = currPos;
 		
-	}
+    }

    /** Emits all reads from the currently held pile, cleans the pile and fully reinitializes wait state
     * (clears indel list etc).
@ -247,35 +247,37 @@ public class IndelRecordPileCollector implements RecordReceiver {
     * splits the train (and the pile) into multiple trains/piles as needed (i.e. if there are pairs of adjacent
     * indels that are not overlapped by any read), and emits the final piles of records into indelReceiver.
      */
-	private void emit() {
+    private void emit() {

-		if ( mState == WAIT_STATE || avoiding_region ) {
+        if ( mState == WAIT_STATE || avoiding_region ) {
            if ( avoiding_region )  {
                long start = mAllIndels.first().getObject().getStart();
                long stop = mAllIndels.last().getObject().getStop();
                System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
-                                   " was ignored: "+mAllIndels.size() +" unique indels with average distance of "+
+                                   " was ignored: ");
+                System.out.println("   "+mAllIndels.size() +" unique indels with average distance of "+
                                   ((double)(stop - start))/((double)mAllIndels.size()-1) +
                                   " bases between indels");
+                System.out.println("   "+mRecordPile.size() +" reads collected before aborting");
            }

-			// no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile
+            // no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile
            for ( SAMRecord r : mRecordPile ) {
                defaultReceiver.receive(r);
            }
            setWaitState();
-			return;
-		}
-		
-		// last minute cleanup:
-		// at this stage we have all the indels collected conservatively (in a sense
-		// that they can be farther away than it is needed) - this means that there actually
-		// can be more than one pile in what we have stored. Also, we can still have gapless reads
-		// at the ends of the piles that do not really overlap with indel sites.
-		
-		if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
-
-		HistogramAsNeeded(mAllIndels);
+            return;
+        }
+        
+        // last minute cleanup:
+        // at this stage we have all the indels collected conservatively (in a sense
+        // that they can be farther away than it is needed) - this means that there actually
+        // can be more than one pile in what we have stored. Also, we can still have gapless reads
+        // at the ends of the piles that do not really overlap with indel sites.
+	
+        if ( mAllIndels.size() == 0 ) throw new RuntimeException("Attempt to emit pile with no indels");
+        
+        HistogramAsNeeded(mAllIndels);


        // indels are in a sorted map, and reads were added to the pile in the order they were received (also sorted).
@ -350,19 +352,19 @@ public class IndelRecordPileCollector implements RecordReceiver {
        }

        setWaitState();
-	}
+    }
 		
 	
-	/** Looks for indels in the cigar and, if finds any, updates list of indels in the current train ans sets
+    /** Looks for indels in the cigar and, if finds any, updates list of indels in the current train ans sets
     * the state to 'active'. If cigar contains no indels, this method does not do anything (it does <i>not</i>
     * set state back to 'wait' either!). If this method finds any indels in the cigar, it first tries to find them
     * in the list of previously seen indels. If the indel was already seen before, its counter is updated (indels
     * are stored in the list as counted objects), oherwise indel is added to the list with initial count of 1.
-	 *
-	 * @param c alignment cigar; if it contains no indels, nothing will be done
-	 * @param start position, at which the alignment represented by cigar <code>c</code> starts on the reference
-	 */
-	private void extractIndelsAndUpdateState(final Cigar c, final int start) {
+     *
+     * @param c alignment cigar; if it contains no indels, nothing will be done
+     * @param start position, at which the alignment represented by cigar <code>c</code> starts on the reference
+     */
+    private void extractIndelsAndUpdateState(final Cigar c, final int start) {
 		//
 		// firstpos,lastpos span of the indel will be interpreted as follows:
 		// any alignment that ends strictly before firstpos or starts strictly after lastpos
@ -387,12 +389,12 @@ public class IndelRecordPileCollector implements RecordReceiver {

        if ( c.numCigarElements() == 1 ) return; // most of the reads have no indels, save a few cycles by returning early

-		for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+        for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
 			
-			final CigarElement ce = c.getCigarElement(i);
+            final CigarElement ce = c.getCigarElement(i);
            Indel indel = null;

-			switch(ce.getOperator()) {
+            switch(ce.getOperator()) {
    		case I: indel = new Indel(runninglength, ce.getLength(), IndelType.I); break;
    		case D: indel = new Indel(runninglength, ce.getLength(), IndelType.D);
    				runninglength += ce.getLength();
@ -400,9 +402,9 @@ public class IndelRecordPileCollector implements RecordReceiver {
    		case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
    		default :
    			throw new IllegalArgumentException("Unexpected operator in cigar string");
-			}
+            }

-			if ( indel == null ) continue; // element was not an indel, go grab next element...
+            if ( indel == null ) continue; // element was not an indel, go grab next element...

            mState = ACTIVE_STATE; // this is redundant and will be executed unnecessarily many times, but it's cheap...

@ -411,82 +413,82 @@ public class IndelRecordPileCollector implements RecordReceiver {
            
            if ( indelWithCount.equals( found ) ) found.increment(); // we did find our indel, advance the counter
            else mAllIndels.add(indelWithCount); // this is a new indel. Add it.
-		} // end for loop over all alignment cigar elements
+        } // end for loop over all alignment cigar elements
 		
-	} // end extractIndels() method
+    } // end extractIndels() method



-	/** Counts the size of the passed <indel> argument into the appropriate size histogram 
-	 * 
-	 * @param indel size of this indel will be counted in
-	 */
-	private void addToSizeHistogram(Indel indel) {
-		// count this indel's size into the appropriate bin of the appropriate histogram
-		// (we count insertions and deletions separately), resizing the histogram array if needed:
-		List<Integer> histogram;
-		if ( indel.getType() == Indel.IndelType.D ) {
-			histogram = mIndelLengthHistD; 
-		} else if ( indel.getType() == Indel.IndelType.I ) {
-			histogram = mIndelLengthHistI; 
-		} else {
-			throw new RuntimeException("Indel of unknown type");
-		}
-		if( indel.getIndelLength() > histogram.size() ) {
-			for ( int j = histogram.size() ; j < indel.getIndelLength() ; j++ ) histogram.add(0);
-			histogram.set((int)indel.getIndelLength()-1, 1); // we are seeing this length for the first time, so count == 1
-		} else {
-			int n = histogram.get((int)indel.getIndelLength()-1);
-			histogram.set((int)indel.getIndelLength()-1, n+1);
-		}		
-	}
+    /** Counts the size of the passed <indel> argument into the appropriate size histogram 
+     * 
+     * @param indel size of this indel will be counted in
+     */
+    private void addToSizeHistogram(Indel indel) {
+        // count this indel's size into the appropriate bin of the appropriate histogram
+        // (we count insertions and deletions separately), resizing the histogram array if needed:
+        List<Integer> histogram;
+        if ( indel.getType() == Indel.IndelType.D ) {
+            histogram = mIndelLengthHistD; 
+        } else if ( indel.getType() == Indel.IndelType.I ) {
+            histogram = mIndelLengthHistI; 
+        } else {
+            throw new RuntimeException("Indel of unknown type");
+        }
+        if( indel.getIndelLength() > histogram.size() ) {
+            for ( int j = histogram.size() ; j < indel.getIndelLength() ; j++ ) histogram.add(0);
+            histogram.set((int)indel.getIndelLength()-1, 1); // we are seeing this length for the first time, so count == 1
+        } else {
+            int n = histogram.get((int)indel.getIndelLength()-1);
+            histogram.set((int)indel.getIndelLength()-1, n+1);
+        }		
+    }
 	
-	/** Adds sizes of the indels from the list that pass some filters to the histograms
-	 * 
-	 * @param indels collection of indels with counts
-	 */
-	private void HistogramAsNeeded(Collection<CountedObject<Indel>> indels) {
-		for ( CountedObject<Indel> o : indels ) {
-			if ( o.getCount() >= 2 ) addToSizeHistogram(o.getObject());
-		}
-	}
+    /** Adds sizes of the indels from the list that pass some filters to the histograms
+     * 
+     * @param indels collection of indels with counts
+     */
+    private void HistogramAsNeeded(Collection<CountedObject<Indel>> indels) {
+        for ( CountedObject<Indel> o : indels ) {
+            if ( o.getCount() >= 2 ) addToSizeHistogram(o.getObject());
+        }
+    }
 	
-	/** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable
-	 * if it contains at least one indel onbserved more than once.
-	 * @param indels list of indels with counts to check for being acceptable
-	 * @return true if the indel run has to be printed
-	 */
-	private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
-		for ( CountedObject<Indel> o :  indels ) {
-			if ( o.getCount() >= 2 ) return true;
-		}
-		return false;
-	}
-
-	private String formatRange(List<CountedObject<Indel>> indels) {
-		StringBuffer b = new StringBuffer();
-		StringBuffer all = new StringBuffer();
-		
-		long min = 1000000000;
-		long max = 0;
+    /** Retruns true if the indel run has to be printed into output; currently, indel run is acceptable
+     * if it contains at least one indel onbserved more than once.
+     * @param indels list of indels with counts to check for being acceptable
+     * @return true if the indel run has to be printed
+     */
+    private boolean shouldAcceptForOutput(List<CountedObject<Indel>> indels) {
+        for ( CountedObject<Indel> o :  indels ) {
+            if ( o.getCount() >= 2 ) return true;
+        }
+        return false;
+    }

+    private String formatRange(List<CountedObject<Indel>> indels) {
+        StringBuffer b = new StringBuffer();
+        StringBuffer all = new StringBuffer();
+	
+        long min = 1000000000;
+        long max = 0;
+        
        all.append("; passing indels:");
-		for ( CountedObject<Indel> o :  indels ) {
-			if ( o.getCount() < 2 ) continue;
+        for ( CountedObject<Indel> o :  indels ) {
+            if ( o.getCount() < 2 ) continue;
            all.append(" ");
            all.append(o.getObject().getIndelLength());
-			if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength();
-			if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength();
-		}
+            if ( o.getObject().getIndelLength() < min ) min = o.getObject().getIndelLength();
+            if ( o.getObject().getIndelLength() > max ) max = o.getObject().getIndelLength();
+        }
        if ( max == 0 ) return new String(); // no passinf indels, return empty string
-
-		b.append(" passing min length: ");
-		b.append(min);
-		b.append("; passing max length: ");
-		b.append(max);
-		b.append(all);
-		return b.toString();
-	}
+        
+        b.append(" passing min length: ");
+        b.append(min);
+        b.append("; passing max length: ");
+        b.append(max);
+        b.append(all);
+        return b.toString();
+    }

    public void printLengthHistograms() {
        if ( mIndelLengthHistD.size() < mIndelLengthHistI.size() ) {
--- a/java/src/org/broadinstitute/sting/playground/utils/TrioConcordanceRecord.java
+++ b/java/src/org/broadinstitute/sting/playground/utils/TrioConcordanceRecord.java
@ -19,6 +19,15 @@ public class TrioConcordanceRecord {
 	public int inconsistent_indels_in_parent  = 0 ;
 	public int inconsistent_indels_in_kid  = 0 ;
 	public int non_biallelic = 0; // number of variant calls that are not biallelic
+	public long mom_assessed = 0; // number of assessed loci for mother (i.e. passing confidence threshold filter)
+	public long dad_assessed = 0;
+	public long kid_assessed = 0;
+	public long mom_ref = 0; // number of reference calls (out of total assessed)
+	public long dad_ref = 0;
+	public long kid_ref = 0;
+	public long mom_snp = 0; // number of snp calls (out of total assessed)
+	public long dad_snp = 0;
+	public long kid_snp = 0;
 	
 	public TrioConcordanceRecord add(TrioConcordanceRecord other) {
 		this.assessed_loci += other.assessed_loci;
@ -32,6 +41,15 @@ public class TrioConcordanceRecord {
 		this.inconsistent_indels_in_parent += other.inconsistent_indels_in_parent;
 		this.inconsistent_indels_in_kid += other.inconsistent_indels_in_kid;
 		this.non_biallelic += other.non_biallelic;
+		this.mom_assessed += other.mom_assessed;
+		this.dad_assessed += other.dad_assessed;
+		this.kid_assessed += other.kid_assessed;
+		this.mom_ref += other.mom_ref;
+		this.dad_ref += other.dad_ref;
+		this.kid_ref += other.kid_ref;
+		this.mom_snp += other.mom_snp;
+		this.dad_snp += other.dad_snp;
+		this.kid_snp += other.kid_snp;
 		
 		return this;
 	}
@ -39,8 +57,11 @@ public class TrioConcordanceRecord {
 	public int totalSNP() { return consistent_snp + inconsistent_snp + non_biallelic; }
 	
 	public String toString() {
-		return String.format("assessed: %d; reference: %d (%3.2f);  total snp: %d;  consistent snp: %d (%3.2f); multiallelic: %d (%3.2f); " , 
+		return String.format("%ntotal assessed in trio: %d%n   reference: %d (%3.2f)%n   total snp: %d%n      consistent snp: %d (%3.2f)%n      multiallelic: %d (%3.2f)%nper trio individual:%n   assessed:%n      mother: %d%n      father: %d%n      daughter: %d%n" , 
 				assessed_loci, consistent_ref, ((double)consistent_ref*100.00)/assessed_loci,totalSNP(), consistent_snp, ((double)consistent_snp*100.0)/totalSNP(),
-				non_biallelic, ((double)non_biallelic*100.0)/totalSNP());
+				non_biallelic, ((double)non_biallelic*100.0)/totalSNP(),mom_assessed,dad_assessed,kid_assessed);
+//		return String.format("total assessed in trio: %d%n   reference: %d (%3.2f)%n   total snp: %d%n      consistent snp: %d (%3.2f)%n      multiallelic: %d (%3.2f)" , 
+//				assessed_loci, consistent_ref, ((double)consistent_ref*100.00)/assessed_loci,totalSNP(), consistent_snp, ((double)consistent_snp*100.0)/totalSNP(),
+//				non_biallelic, ((double)non_biallelic*100.0)/totalSNP());
 	}
 }