Merge pull request #468 from broadinstitute/gg_fixSAMPileup

Updated SAMPileup codec and pileup-related docs
2014-01-14 06:30:04 -08:00 · 2014-01-14 06:30:04 -08:00 · f67c33919b
parent 16ecc53749 edf5880022
commit f67c33919b
5 changed files with 318 additions and 100 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
@ -49,20 +49,109 @@ import java.io.PrintStream;
 import java.util.Arrays;

 /**
- * At every locus in the input set, compares the pileup data (reference base, aligned base from
- * each overlapping read, and quality score) to the reference pileup data generated by samtools.  Samtools' pileup data
- * should be specified using the command-line argument '-pileup:SAMPileup <your sam pileup file>'.
+ * Compare GATK's internal pileup to a reference Samtools pileup
+ *
+ * <p>At every locus in the input set, compares the pileup data (reference base, aligned base from
+ * each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated
+ * by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly
+ * different output format by default.
+ * </p>
+ *
+ * <h3>Format</h3>
+ * <p>There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
+ * 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated.</p>
+ * <h4>Simple pileup: 6-column format</h4>
+ * <p>
+ *     Each line consists of chromosome, 1-based coordinate, reference base, the
+ *     number of reads covering the site, read bases and base qualities. At the
+ *     read base column, a dot stands for a match to the reference base on the
+ *     forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch
+ *     on the forward strand and `acgtn' for a mismatch on the reverse strand.
+ *     A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between
+ *     this reference position and the next reference position. The length of the
+ *     insertion is given by the integer in the pattern, followed by the inserted sequence.
+ * </p>
+ * <pre>
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * </pre>
+ * <p>
+ *     See the <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details.
+ * </p>
+ *
+ * <h4>Consensus pileup: 10/13-column format</h4>
+ * <p>The "consensus" or extended pileup consists of the following:
+ *  <ul>
+ *      <li>original 6 columns as described above</li>
+ *      <li>4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
+ * reads covering the sites) for all sites, inserted before the bases and quality strings</li>
+ *      <li>3 extra columns indicating counts of reads supporting indels (just for indel sites)</li>
+ *  </ul>
+ * </p>
+ * <h4>Example of consensus pileup for SNP or non-variant sites</h4>
+ * <pre>
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<<
+ * </pre>
+ *
+ * <h4>Example of consensus pileup for indels</h4>
+ * <pre>
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * </pre>
+ * <p>
+ *     See <a href="http://samtools.sourceforge.net/cns0.shtml">Consensus pileup format (deprecated)</a> for more details.
+ * </p>
+ *
+ * <h3>Input</h3>
+ * <p>A BAM file containing your aligned sequence data and a pileup file generated by Samtools covering the region you
+ * want to examine.</p>
+ *
+ * <h3>Output</h3>
+ * <p>A text file listing mismatches between the input pileup and GATK's internal pileup. If there are no mismatches, the output file is empty.</p>
+ *
+ * <h3>Example</h3>
+ * <pre>
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T CheckPileup \
+ *   -R ref.fasta \
+ *   -I your_data.bam \
+ *   --pileup:SAMPileup pileup_file.txt \
+ *   -L chr1:257-275 \
+ *   -o output_file_name
+ * </pre>
 */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@Requires(value={DataSource.READS,DataSource.REFERENCE})
 public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implements TreeReducible<CheckPileupStats> {
-    @Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true)
+    /**
+     * This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval.
+     */
+    @Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true)
    RodBinding<SAMPileupFeature> pileup;

    @Output
    private PrintStream out;
-
-    @Argument(fullName="continue_after_error",doc="Continue after an error",required=false)
+    /**
+     * By default the program will quit if it encounters an error (such as missing truth data for a given position).
+     * Use this flag to override the default behavior; the program will then simply print an error message and move on
+     * to the next position.
+     */
+    @Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false)
    public boolean CONTINUE_AFTER_AN_ERROR = false;

    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -72,7 +161,7 @@ public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implemen
        if ( truePileup == null ) {
            out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar()));
            if ( ! CONTINUE_AFTER_AN_ERROR ) {
-                throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
+                throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
                        context.getLocation(), new String(pileup.getBases())));
            }
        } else {
@ -80,7 +169,7 @@ public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implemen
            if ( pileupDiff != null ) {
                out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString());
                if ( ! CONTINUE_AFTER_AN_ERROR ) {
-                    throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff));
+                    throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff));
                }
            }
        }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
@ -48,11 +48,17 @@ import java.util.List;
 /**
 * Emulates the samtools pileup command to print aligned reads
 *
- * <p>Prints the alignment in something similar to the samtools pileup format.  Each line represents a genomic position,
- * consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
+ * <p>Prints the alignment in something similar to the Samtools pileup format (see the
+ * <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details about
+ * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference
+ * base, read bases, and read qualities. In addition to these default fields, additional information can be added to
+ * the output as extra columns; see options detailed below.</p>
 *
- * Emulated command:
- * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] <in.alignment>
+ * <h4>Emulated command:</h4>
+ * <pre>
+ *  samtools pileup -f in.ref.fasta -l in.site_list input.bam
+ * </pre>
+
 *
 * <h3>Input</h3>
 * <p>
@ -61,17 +67,32 @@ import java.util.List;
 *
 * <h3>Output</h3>
 * <p>
- * Formatted pileup-style alignment of reads.
+ *  Alignment of reads formatted in the Pileup style.
 * </p>
 *
 * <h3>Example</h3>
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -T Pileup \
- *   -R ref.fasta \
- *   -I aligned_reads.bam \
+ *   -R exampleFASTA.fasta \
+ *   -I exampleBAM.bam \
+ *   -L chr1:257-267 \
 *   -o output.txt
 * </pre>
+ * <h4>Expected output</h4>
+ * <pre>
+ *     chr1 257 A CAA '&=
+ *     chr1 258 C TCC A:=
+ *     chr1 259 C CCC )A=
+ *     chr1 260 C ACC (=<
+ *     chr1 261 T TCT '44
+ *     chr1 262 A AAA '?:
+ *     chr1 263 A AGA 1'6
+ *     chr1 264 C TCC 987
+ *     chr1 265 C CCC (@(
+ *     chr1 266 C GCC ''=
+ *     chr1 267 T AAT 7%>
+ * </pre>
 *
 */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@ -83,18 +104,25 @@ public class Pileup extends LocusWalker<String, Integer> implements TreeReducibl
    PrintStream out;

    /**
-     * In addition to the standard pileup output, adds 'verbose' output too.  The verbose output contains the number of spanning deletions,
+     * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
     * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality.  These per
     * read items are delimited with an '@' character.
     */
    @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false)
    public boolean SHOW_VERBOSE = false;
-
-    @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
+    /**
+     * This enables annotating the pileup to show overlaps with metadata from a ROD file.
+     * For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup
+     * output at that position will be annotated with the corresponding source ROD identifier.
+     */
+    @Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false)
    public List<RodBinding<Feature>> rods = Collections.emptyList();
-
+    /**
+     * Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert
+     * produced during library generation before sequencing.
+     */
    @Hidden
-    @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false)
+    @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false)
    public boolean outputInsertLength=false;

    @Override
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
@ -37,13 +37,21 @@ import java.util.regex.Pattern;
 import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;

 /**
- * Decoder for SAM pileup data.  For GATK validation purposes only
+ * Decoder for SAM pileup data.
 *
 * <p>
- *     Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute.
- *     It desribes the base-pair information at each chromosomal position. This format
- *     facilitates SNP/indel calling and brief alignment viewing by eyes.
+ *     From the <a href="http://samtools.sourceforge.net/">SAMTools project documentation</a>:
 * </p>
+ * <p>The Pileup format was first used by Tony Cox and Zemin Ning at
+ *     the Sanger Institute. It describes the base-pair information at each chromosomal position. This format
+ *     facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced
+ *     in Samtools by mpileup, which produces a slightly different output format by default.
+ * </p>
+
+ * <h3>Format</h3>
+ * <p>There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
+ * 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated. </p>
+ * <h4>Simple pileup: 6-column format</h4>
 * <p>
 *     Each line consists of chromosome, 1-based coordinate, reference base, the
 *     number of reads covering the site, read bases and base qualities. At the
@ -54,13 +62,6 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
 *     this reference position and the next reference position. The length of the
 *     insertion is given by the integer in the pattern, followed by the inserted sequence.
 * </p>
- *
- * <p>
- *     <br>See also: @see <a href="http://samtools.sourceforge.net/">SAMTools project</a></br>
- *     <br>See also: @see <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format</a></br>
- * </p>
- *
- * <h2>File format example</h2>
 * <pre>
 *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
 *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
@ -71,15 +72,55 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
 *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
 *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
 * </pre>
+ * <p>
+ *     See the <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details.
+ * </p>
 *
- * @author Matt Hanna
- * @since 2009
+ * <h4>Consensus pileup: 10/13-column format</h4>
+ * <p>The "consensus" or extended pileup consists of the following:
+ *  <ul>
+ *      <li>original 6 columns as described above</li>
+ *      <li>4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
+ * reads covering the sites) for all sites, inserted before the bases and quality strings</li>
+ *      <li>3 extra columns indicating counts of reads supporting indels (just for indel sites)</li>
+ *  </ul>
+ * </p>
+ * <h4>Example of consensus pileup for SNP or non-variant sites</h4>
+ * <pre>
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<<
+ * </pre>
+ *
+ * <h4>Example of consensus pileup for indels</h4>
+ * <pre>
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * </pre>
+ * <p>
+ *     See <a href="http://samtools.sourceforge.net/cns0.shtml">Consensus pileup format (deprecated)</a> for more details.
+ * </p>
+ *
+ * <h3>Caveat</h3>
+ * <p>Handling of indels is questionable at the moment. Proceed with care.</p>
+ *
+ *
+ * @author Matt Hanna, Geraldine VdAuwera
+ * @since 2014
 */
 public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
-    // the number of tokens we expect to parse from a pileup line
-    private static final int expectedTokenCount = 10;
+    // number of tokens expected (6, 10 or 13 are valid, anything else is wrong)
+    private static final int basicTokenCount = 6;
+    private static final int consensusSNPTokenCount = 10;
+    private static final int consensusIndelTokenCount = 13;
    private static final char fldDelim = '\t';
-
    // allocate once and don't ever bother creating them again:
    private static final String baseA = "A";
    private static final String baseC = "C";
@ -92,74 +133,110 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
    }

    public SAMPileupFeature decode(String line) {
-//       0          1             2         3                  4         5            6         7
-//*     chrX     466           T         Y                170      170       88      32 ... (piles of read bases  and quals follow)
-//*    chrX    141444     *     +CA/+CA       32       468     255     25     +CA     *       5       2       12      6
-        String[] tokens = new String[expectedTokenCount];
+        //+1 because we want to know if we have more than the max
+        String[] tokens = new String[consensusIndelTokenCount+1];

        // split the line
-        int count = ParsingUtils.split(line,tokens,fldDelim);
-
-        // check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
-        if (count != expectedTokenCount)
-            throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " +
-                                                "(expected = " + expectedTokenCount + ", saw = " + count + " on " +
-                                                "line = " + line + ")");
+        final int count = ParsingUtils.split(line,tokens,fldDelim);

        SAMPileupFeature feature = new SAMPileupFeature();

+        /**
+         * Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation.
+         * The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at
+         * the end for indels. We are currently not making any use of the extra indel tokens.
+         *
+         * Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong.
+         */
+        final String observedString, bases, quals;
+
        feature.setChr(tokens[0]);
        feature.setStart(Integer.parseInt(tokens[1]));

-        if(tokens[2].length() != 1)
+        if(tokens[2].length() != 1)  {
            throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
-        feature.setRef(Character.toUpperCase(tokens[2].charAt(0)));
-
-        String observedString = tokens[3].toUpperCase(); // field 3
-        feature.setFWDAlleles(new ArrayList<String>(2));
-
-        feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
-        feature.setVariantConfidence(Double.parseDouble(tokens[5]));
-
-        if ( feature.getRef() == '*' ) {
-            parseIndels(observedString,feature) ;
-            if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
-            else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
-        } else {
-            parseBasesAndQuals(feature,tokens[8],tokens[9]);
-            // if the variant is a SNP or a reference base (i.e. no variant at all)
-            if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
-            feature.setRefBases(tokens[2].toUpperCase());
-            feature.setEnd(feature.getStart());
-
-            char ch = observedString.charAt(0);
-
-            switch ( ch ) {
-                case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
-                case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
-                case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
-                case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
-                case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
-                case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
-                case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
-                case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
-                case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
-                case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
-            }
-            if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
-            else {
-                // 	we know that at least one allele is non-ref;
-                // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
-                // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
-                feature.setVariantType(VariantType.SNP);
-                if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
-                        feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
-                        feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
-                        ) feature.setNumNonRef(1);
-                else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
-            }
        }
+        feature.setRef(tokens[2].charAt(0));

+        switch (count) {
+            case basicTokenCount:
+                bases = tokens[4];
+                quals = tokens[5];
+                // parsing is pretty straightforward for 6-col format
+                if ( feature.getRef() == '*' ) {   // this indicates an indel -- but it shouldn't occur with vanilla 6-col format
+                    throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format");
+                } else {
+                    parseBasesAndQuals(feature, bases, quals);
+                    feature.setRefBases(tokens[2].toUpperCase());
+                    feature.setEnd(feature.getStart());
+                }
+                break;
+            case consensusSNPTokenCount: // pileup called a SNP or a reference base
+                observedString = tokens[3].toUpperCase();
+                feature.setFWDAlleles(new ArrayList<String>(2));
+                feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+                feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+                bases = tokens[8];
+                quals = tokens[9];
+                // confirm that we have a non-variant, not a mis-parsed indel
+                if ( feature.getRef() == '*' ) {
+                    throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel");
+                }
+                // Parse the SNP or non-variant
+                parseBasesAndQuals(feature, bases, quals);
+                if ( observedString.length() != 1 ) {
+                    throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString);
+                }
+                feature.setRefBases(tokens[2].toUpperCase());
+                feature.setEnd(feature.getStart());
+
+                char ch = observedString.charAt(0);
+
+                switch ( ch ) {  // record alleles (decompose ambiguous base codes)
+                    case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
+                    case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
+                    case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
+                    case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
+                    case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
+                    case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
+                    case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
+                    case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
+                    case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
+                    case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
+                }
+                if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
+                else {
+                    // 	we know that at least one allele is non-ref;
+                    // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
+                    // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
+                    feature.setVariantType(VariantType.SNP);
+                    if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
+                            feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
+                            feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
+                            ) feature.setNumNonRef(1);
+                    else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
+                }
+                break;
+            case consensusIndelTokenCount:
+                observedString = tokens[3].toUpperCase();
+                feature.setFWDAlleles(new ArrayList<String>(2));
+                feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+                feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+                // confirm that we have an indel, not a mis-parsed SNP or non-variant
+                if ( feature.getRef() != '*' ) {
+                    throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'");
+                }
+                // Parse the indel
+                parseIndels(observedString,feature) ;
+                if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
+                else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!!
+                break;
+            default:
+                throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " +
+                    "(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount +
+                    " (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount +
+                    " (consensus pileup for an indel); saw = " + count + " on line = " + line + ")");
+        }
        return feature;
    }

@ -197,7 +274,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
                    else feature.setVariantType(VariantType.DELETION);
                    feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele"
                    break;
-                default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
+                default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype);
            }
            feature.getFWDAlleles().add(varBases);
            feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!!
@ -224,7 +301,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
    {
        //System.out.printf("%s%n%s%n", bases, quals);

-        // needs to convert the base string with it's . and , to the ref base
+        // needs to convert the base string with its . and , to the ref base
        StringBuilder baseBuilder = new StringBuilder();
        StringBuilder qualBuilder = new StringBuilder();
        boolean done = false;
@ -254,7 +331,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
                    Matcher match = regex.matcher(rest);
                    if ( ! match.matches() ) {
                        if ( feature.getRef() != '*' )
-                            throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
+                            throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i);
                        done = true;
                    }
                    else {
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
@ -33,6 +33,10 @@ import java.util.List;
 /**
 * A tribble feature representing a SAM pileup.
 *
+ * Allows intake of both simple (6-column) and extended/consensus (10/13-column) pileups. Simple pileup features will
+ * contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as
+ * input for analysis that requires that information.
+ *
 * @author mhanna
 * @version 0.1
 */
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
@ -33,18 +33,38 @@ import java.util.Collections;
 /**
 * Run validating pileup across a set of core data as proof of the integrity of the GATK core.
 *
- * @author mhanna
- * @version 0.1
+ * Tests both types of old-school pileup formats (basic and consensus).
+ *
+ * @author mhanna, vdauwera
+ * @version 0.2
 */
 public class CheckPileupIntegrationTest extends WalkerTest {
+    /**
+     * This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels
+     */
    @Test(enabled = true)
-    public void testEcoliThreaded() {
+    public void testEcoliConsensusPileup() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T CheckPileup" +
                " -I " + validationDataLocation + "MV1994.selected.bam" +
                " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
                " --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" +
                " -S SILENT -nt 8",0, Collections.<String>emptyList());
-        executeTest("testEcoliThreaded",spec);
+        executeTest("testEcoliConsensusPileup",spec);
+    }
+
+    /**
+     * This test runs on a basic pileup containing 6-column lines for all variants  TODO
+     */
+    @Test
+    public void testEcoliBasicPileup() {
+        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+                "-T CheckPileup" +
+                        " -I " + validationDataLocation + "MV1994.selected.bam" +
+                        " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
+                        " --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" +
+                        " -L Escherichia_coli_K12:1-49" +
+                        " -S SILENT -nt 8",0, Collections.<String>emptyList());
+        executeTest("testEcoliBasicPileup",spec);
    }
 }