diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
index 533c7be73..b6a3853f8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
@@ -49,20 +49,109 @@ import java.io.PrintStream;
import java.util.Arrays;
/**
- * At every locus in the input set, compares the pileup data (reference base, aligned base from
- * each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data
- * should be specified using the command-line argument '-pileup:SAMPileup '.
+ * Compare GATK's internal pileup to a reference Samtools pileup
+ *
+ * At every locus in the input set, compares the pileup data (reference base, aligned base from
+ * each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated
+ * by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly
+ * different output format by default.
+ *
+ *
+ * Format
+ * There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
+ * 10-column "consensus" format which could be obtained by using the -c argument, now deprecated.
+ * Simple pileup: 6-column format
+ *
+ * Each line consists of chromosome, 1-based coordinate, reference base, the
+ * number of reads covering the site, read bases and base qualities. At the
+ * read base column, a dot stands for a match to the reference base on the
+ * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch
+ * on the forward strand and `acgtn' for a mismatch on the reverse strand.
+ * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between
+ * this reference position and the next reference position. The length of the
+ * insertion is given by the integer in the pattern, followed by the inserted sequence.
+ *
+ *
+ * seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ * seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ * seq1 274 T 23 ,.$....,,.,.,...,,,.,... 7<7;<;<<<<<<<<<=<;<;<<6
+ * seq1 275 A 23 ,$....,,.,.,...,,,.,...^l. <+;9*<<<<<<<<<=<<:;<<<<
+ * seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6<
+ * seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&<
+ * seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
+ * seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ *
+ *
+ * See the Pileup format documentation for more details.
+ *
+ *
+ * Consensus pileup: 10/13-column format
+ * The "consensus" or extended pileup consists of the following:
+ *
+ * - original 6 columns as described above
+ * - 4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
+ * reads covering the sites) for all sites, inserted before the bases and quality strings
+ * - 3 extra columns indicating counts of reads supporting indels (just for indel sites)
+ *
+ *
+ * Example of consensus pileup for SNP or non-variant sites
+ *
+ * seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
+ * seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
+ * seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
+ * seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
+ * seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
+ * seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
+ * seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
+ * seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
+ * seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
+ *
+ *
+ * Example of consensus pileup for indels
+ *
+ * Escherichia_coli_K12 3995037 * *\/* 430 0 37 144 * +A 143 1 0
+ * Escherichia_coli_K12 3995279 * *\/* 202 0 36 68 * +A 67 1 0
+ * Escherichia_coli_K12 3995281 * *\/* 239 0 36 67 * -CG 66 1 0
+ *
+ *
+ * See Consensus pileup format (deprecated) for more details.
+ *
+ *
+ * Input
+ * A BAM file conatining your aligned sequence data and a pileup file generated by Samtools covering the region you
+ * want to examine.
+ *
+ * Output
+ * A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.
+ *
+ * Example
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -T CheckPileup \
+ * -R ref.fasta \
+ * -I your_data.bam \
+ * --pileup:SAMPileup pileup_file.txt \
+ * -L chr1:257-275 \
+ * -o output_file_name
+ *
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@Requires(value={DataSource.READS,DataSource.REFERENCE})
public class CheckPileup extends LocusWalker implements TreeReducible {
- @Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true)
+ /**
+ * This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval.
+ */
+ @Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true)
RodBinding pileup;
@Output
private PrintStream out;
-
- @Argument(fullName="continue_after_error",doc="Continue after an error",required=false)
+ /**
+ * By default the program will quit if it encounters an error (such as missing truth data for a given position).
+ * Use this flag to override the default behavior; the program will then simply print an error message and move on
+ * to the next position.
+ */
+ @Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false)
public boolean CONTINUE_AFTER_AN_ERROR = false;
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@@ -72,7 +161,7 @@ public class CheckPileup extends LocusWalker implemen
if ( truePileup == null ) {
out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar()));
if ( ! CONTINUE_AFTER_AN_ERROR ) {
- throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
+ throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
context.getLocation(), new String(pileup.getBases())));
}
} else {
@@ -80,7 +169,7 @@ public class CheckPileup extends LocusWalker implemen
if ( pileupDiff != null ) {
out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString());
if ( ! CONTINUE_AFTER_AN_ERROR ) {
- throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff));
+ throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff));
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
index 23bbf1460..48e21fdd0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
@@ -48,11 +48,17 @@ import java.util.List;
/**
* Emulates the samtools pileup command to print aligned reads
*
- * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position,
- * consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
+ *
Prints the alignment in something similar to the Samtools pileup format (see the
+ * Pileup format documentation for more details about
+ * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference
+ * base, read bases, and read qualities. In addition to these default fields, additional information can be added to
+ * the output as extra columns; see options detailed below.
*
- * Emulated command:
- * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate]
+ * Emulated command:
+ *
+ * samtools pileup -f in.ref.fasta -l in.site_list input.bam
+ *
+
*
* Input
*
@@ -61,17 +67,32 @@ import java.util.List;
*
*
Output
*
- * Formatted pileup-style alignment of reads.
+ * Alignment of reads formatted in the Pileup style.
*
*
* Example
*
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -T Pileup \
- * -R ref.fasta \
- * -I aligned_reads.bam \
+ * -R exampleFASTA.fasta \
+ * -I exampleBAM.bam \
+ * -L chr1:257-267
* -o output.txt
*
+ * Expected output
+ *
+ * chr1 257 A CAA '&=
+ * chr1 258 C TCC A:=
+ * chr1 259 C CCC )A=
+ * chr1 260 C ACC (=<
+ * chr1 261 T TCT '44
+ * chr1 262 A AAA '?:
+ * chr1 263 A AGA 1'6
+ * chr1 264 C TCC 987
+ * chr1 265 C CCC (@(
+ * chr1 266 C GCC ''=
+ * chr1 267 T AAT 7%>
+ *
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@@ -83,18 +104,25 @@ public class Pileup extends LocusWalker implements TreeReducibl
PrintStream out;
/**
- * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
+ * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
* and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per
* read items are delimited with an '@' character.
*/
@Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false)
public boolean SHOW_VERBOSE = false;
-
- @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
+ /**
+ * This enables annotating the pileup to show overlaps with metadata from a ROD file.
+ * For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup
+ * output at that position will be annotated with the corresponding source ROD identifier.
+ */
+ @Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false)
public List> rods = Collections.emptyList();
-
+ /**
+ * Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert
+ * produced during library generation before sequencing.
+ */
@Hidden
- @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false)
+ @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false)
public boolean outputInsertLength=false;
@Override
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
index 34705c4c9..70241a6c4 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
@@ -37,13 +37,21 @@ import java.util.regex.Pattern;
import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;
/**
- * Decoder for SAM pileup data. For GATK validation purposes only
+ * Decoder for SAM pileup data.
*
*
- * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute.
- * It desribes the base-pair information at each chromosomal position. This format
- * facilitates SNP/indel calling and brief alignment viewing by eyes.
+ * From the SAMTools project documentation:
*
+ * The Pileup format was first used by Tony Cox and Zemin Ning at
+ * the Sanger Institute. It describes the base-pair information at each chromosomal position. This format
+ * facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced
+ * in Samtools by mpileup, which produces a slightly different output format by default.
+ *
+
+ * Format
+ * There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
+ * 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated.
+ * Simple pileup: 6-column format
*
* Each line consists of chromosome, 1-based coordinate, reference base, the
* number of reads covering the site, read bases and base qualities. At the
@@ -54,13 +62,6 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
* this reference position and the next reference position. The length of the
* insertion is given by the integer in the pattern, followed by the inserted sequence.
*
- *
- *
- *
See also: @see SAMTools project
- *
See also: @see Pileup format
- *
- *
- * File format example
*
* seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
* seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
@@ -71,15 +72,55 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
* seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
* seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
*
+ *
+ * See the Pileup format documentation for more details.
+ *
*
- * @author Matt Hanna
- * @since 2009
+ * Consensus pileup: 10/13-column format
+ * The "consensus" or extended pileup consists of the following:
+ *
+ * - original 6 columns as described above
+ * - 4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
+ * reads covering the sites) for all sites, inserted before the bases and quality strings
+ * - 3 extra columns indicating counts of reads supporting indels (just for indel sites)
+ *
+ *
+ * Example of consensus pileup for SNP or non-variant sites
+ *
+ * seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
+ * seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
+ * seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
+ * seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
+ * seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
+ * seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
+ * seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
+ * seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
+ * seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
+ *
+ *
+ * Example of consensus pileup for indels
+ *
+ * Escherichia_coli_K12 3995037 * *\/* 430 0 37 144 * +A 143 1 0
+ * Escherichia_coli_K12 3995279 * *\/* 202 0 36 68 * +A 67 1 0
+ * Escherichia_coli_K12 3995281 * *\/* 239 0 36 67 * -CG 66 1 0
+ *
+ *
+ * See Consensus pileup format (deprecated) for more details.
+ *
+ *
+ * Caveat
+ * Handling of indels is questionable at the moment. Proceed with care.
+ *
+ *
+ * @author Matt Hanna, Geraldine VdAuwera
+ * @since 2014
*/
public class SAMPileupCodec extends AsciiFeatureCodec {
- // the number of tokens we expect to parse from a pileup line
- private static final int expectedTokenCount = 10;
+ // number of tokens expected (6 or 10 are valid, anything else is wrong)
+ private static final int basicTokenCount = 6;
+ private static final int consensusSNPTokenCount = 10;
+ private static final int consensusIndelTokenCount = 13;
private static final char fldDelim = '\t';
-
// allocate once and don't ever bother creating them again:
private static final String baseA = "A";
private static final String baseC = "C";
@@ -92,74 +133,110 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
}
public SAMPileupFeature decode(String line) {
-// 0 1 2 3 4 5 6 7
-//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
-//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
- String[] tokens = new String[expectedTokenCount];
+ //+1 because we want to know if we have more than the max
+ String[] tokens = new String[consensusIndelTokenCount+1];
// split the line
- int count = ParsingUtils.split(line,tokens,fldDelim);
-
- // check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
- if (count != expectedTokenCount)
- throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " +
- "(expected = " + expectedTokenCount + ", saw = " + count + " on " +
- "line = " + line + ")");
+ final int count = ParsingUtils.split(line,tokens,fldDelim);
SAMPileupFeature feature = new SAMPileupFeature();
+ /**
+ * Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation.
+ * The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at
+ * the end for indels. We are currently not making any use of the extra indel tokens.
+ *
+ * Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong.
+ */
+ final String observedString, bases, quals;
+
feature.setChr(tokens[0]);
feature.setStart(Integer.parseInt(tokens[1]));
- if(tokens[2].length() != 1)
+ if(tokens[2].length() != 1) {
throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
- feature.setRef(Character.toUpperCase(tokens[2].charAt(0)));
-
- String observedString = tokens[3].toUpperCase(); // field 3
- feature.setFWDAlleles(new ArrayList(2));
-
- feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
- feature.setVariantConfidence(Double.parseDouble(tokens[5]));
-
- if ( feature.getRef() == '*' ) {
- parseIndels(observedString,feature) ;
- if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
- else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
- } else {
- parseBasesAndQuals(feature,tokens[8],tokens[9]);
- // if the variant is a SNP or a reference base (i.e. no variant at all)
- if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
- feature.setRefBases(tokens[2].toUpperCase());
- feature.setEnd(feature.getStart());
-
- char ch = observedString.charAt(0);
-
- switch ( ch ) {
- case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
- case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
- case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
- case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
- case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
- case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
- case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
- case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
- case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
- case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
- }
- if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
- else {
- // we know that at least one allele is non-ref;
- // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
- // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
- feature.setVariantType(VariantType.SNP);
- if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
- feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
- feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
- ) feature.setNumNonRef(1);
- else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
- }
}
+ feature.setRef(tokens[2].charAt(0));
+ switch (count) {
+ case basicTokenCount:
+ bases = tokens[4];
+ quals = tokens[5];
+ // parsing is pretty straightforward for 6-col format
+ if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format
+ throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format");
+ } else {
+ parseBasesAndQuals(feature, bases, quals);
+ feature.setRefBases(tokens[2].toUpperCase());
+ feature.setEnd(feature.getStart());
+ }
+ break;
+ case consensusSNPTokenCount: // pileup called a SNP or a reference base
+ observedString = tokens[3].toUpperCase();
+ feature.setFWDAlleles(new ArrayList(2));
+ feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+ feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+ bases = tokens[8];
+ quals = tokens[9];
+ // confirm that we have a non-variant, not a mis-parsed indel
+ if ( feature.getRef() == '*' ) {
+ throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel");
+ }
+ // Parse the SNP or non-variant
+ parseBasesAndQuals(feature, bases, quals);
+ if ( observedString.length() != 1 ) {
+ throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString);
+ }
+ feature.setRefBases(tokens[2].toUpperCase());
+ feature.setEnd(feature.getStart());
+
+ char ch = observedString.charAt(0);
+
+ switch ( ch ) { // record alleles (decompose ambiguous base codes)
+ case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
+ case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
+ case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
+ case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
+ case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
+ case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
+ case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
+ case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
+ case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
+ case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
+ }
+ if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
+ else {
+ // we know that at least one allele is non-ref;
+ // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
+ // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
+ feature.setVariantType(VariantType.SNP);
+ if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
+ feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
+ feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
+ ) feature.setNumNonRef(1);
+ else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
+ }
+ break;
+ case consensusIndelTokenCount:
+ observedString = tokens[3].toUpperCase();
+ feature.setFWDAlleles(new ArrayList(2));
+ feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+ feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+ // confirm that we have an indel, not a mis-parsed SNP or non-variant
+ if ( feature.getRef() != '*' ) {
+ throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'");
+ }
+ // Parse the indel
+ parseIndels(observedString,feature) ;
+ if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
+ else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!!
+ break;
+ default:
+ throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " +
+ "(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount +
+ " (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount +
+ " (consensus pileup for an indel); saw = " + count + " on line = " + line + ")");
+ }
return feature;
}
@@ -197,7 +274,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
else feature.setVariantType(VariantType.DELETION);
feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele"
break;
- default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
+ default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype);
}
feature.getFWDAlleles().add(varBases);
feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!!
@@ -224,7 +301,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
{
//System.out.printf("%s%n%s%n", bases, quals);
- // needs to convert the base string with it's . and , to the ref base
+ // needs to convert the base string with its . and , to the ref base
StringBuilder baseBuilder = new StringBuilder();
StringBuilder qualBuilder = new StringBuilder();
boolean done = false;
@@ -254,7 +331,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
Matcher match = regex.matcher(rest);
if ( ! match.matches() ) {
if ( feature.getRef() != '*' )
- throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
+ throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i);
done = true;
}
else {
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
index a6fd996fd..287363601 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
@@ -33,6 +33,10 @@ import java.util.List;
/**
* A tribble feature representing a SAM pileup.
*
+ * Allows intake of both simple (6-column) or extended/consensus (10/13-column) pileups. Simple pileup features will
+ * contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as
+ * input for analysis that requires that information.
+ *
* @author mhanna
* @version 0.1
*/
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
index 4d3741228..0971cb90b 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
@@ -33,18 +33,38 @@ import java.util.Collections;
/**
* Run validating pileup across a set of core data as proof of the integrity of the GATK core.
*
- * @author mhanna
- * @version 0.1
+ * Tests both types of old-school pileup formats (basic and consensus).
+ *
+ * @author mhanna, vdauwera
+ * @version 0.2
*/
public class CheckPileupIntegrationTest extends WalkerTest {
+ /**
+ * This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels
+ */
@Test(enabled = true)
- public void testEcoliThreaded() {
+ public void testEcoliConsensusPileup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CheckPileup" +
" -I " + validationDataLocation + "MV1994.selected.bam" +
" -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
" --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" +
" -S SILENT -nt 8",0, Collections.emptyList());
- executeTest("testEcoliThreaded",spec);
+ executeTest("testEcoliConsensusPileup",spec);
+ }
+
+ /**
+ * This test runs on a basic pileup containing 6-column lines for all variants TODO
+ */
+ @Test
+ public void testEcoliBasicPileup() {
+ WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+ "-T CheckPileup" +
+ " -I " + validationDataLocation + "MV1994.selected.bam" +
+ " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
+ " --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" +
+ " -L Escherichia_coli_K12:1-49" +
+ " -S SILENT -nt 8",0, Collections.emptyList());
+ executeTest("testEcoliBasicPileup",spec);
}
}